diff --git a/.gitignore b/.gitignore
index 4ff369a0..556c02c4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -147,7 +147,7 @@ CTestTestfile.cmake
 ### Python ###
 # Byte-compiled / optimized / DLL files
 __pycache__/
-*.py[cod]
+*.py[od]
 *$py.class
 
 # C extensions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 225beaec..5adaa5f3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -96,8 +96,8 @@ IF(WIN32)
     SET(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi")
     SET(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zi")
 
-    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275")
-    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275")
+    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275 /wd4819")
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275 /wd4819")
 ENDIF()
 ENDIF()
 
@@ -124,9 +124,6 @@ endif()
 if(MNN_SUPPORT_TFLITE_QUAN)
     add_definitions(-DMNN_SUPPORT_TFLITE_QUAN)
 endif()
-if(MNN_BUILD_MINI)
-    add_definitions(-DMNN_BUILD_MINI)
-endif()
 
 # debug options
 if(MNN_DEBUG_MEMORY)
@@ -156,6 +153,12 @@ if (MNN_USE_THREAD_POOL)
     add_definitions(-DMNN_USE_THREAD_POOL)
 endif()
 
+# When building Android for arm32 with MTL, force turn off MNN_ARM82
+if (CMAKE_SYSTEM_NAME MATCHES "^Android" AND CMAKE_SYSTEM_PROCESSOR MATCHES "^armv7" AND NOT MNN_BUILD_FOR_ANDROID_COMMAND)
+    message(STATUS "force turn off MNN_ARM82 when build for Android based on arm32 by MTL")
+    SET(MNN_ARM82 OFF CACHE BOOL "Enable ARM82" FORCE)
+endif()
+
 # target options
 option(MNN_BUILD_BENCHMARK "Build benchmark or not" OFF)
 option(MNN_BUILD_TEST "Build tests or not" OFF)
@@ -181,6 +184,7 @@ message(STATUS "\toneDNN: ${MNN_ONEDNN}")
 message(STATUS "\tTensorRT: ${MNN_TENSORRT}")
 message(STATUS "\tCUDA: ${MNN_CUDA}")
 message(STATUS "\tOpenMP: ${MNN_OPENMP}")
+message(STATUS "\tBF16: ${MNN_SUPPORT_BF16}")
 message(STATUS "\tThreadPool: ${MNN_USE_THREAD_POOL}")
 message(STATUS "\tHidden: ${MNN_HIDDEN}")
 message(STATUS "\tBuild Path: ${CMAKE_CURRENT_BINARY_DIR}")
@@ -306,6 +310,9 @@ FILE(GLOB MNN_Core_SRC ${CMAKE_CURRENT_LIST_DIR}/source/core/*)
 add_library(MNNCore OBJECT ${MNN_Core_SRC})
 list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNCore>)
 list(APPEND MNN_TARGETS MNNCore)
+if(MNN_BUILD_MINI)
+    target_compile_options(MNNCore PRIVATE -DMNN_BUILD_MINI)
+endif()
 
 # CV
 FILE(GLOB MNN_CV_SRC ${CMAKE_CURRENT_LIST_DIR}/source/cv/*)
@@ -340,23 +347,8 @@ add_library(MNNUtils OBJECT ${MNN_Utils_SRC})
 list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNUtils>)
 list(APPEND MNN_TARGETS MNNUtils)
 
-# CPU
-FILE(GLOB MNN_CPU_SRC ${CMAKE_CURRENT_LIST_DIR}/source/backend/cpu/* ${CMAKE_CURRENT_LIST_DIR}/source/backend/cpu/compute/*)
-add_library(MNNCPU OBJECT ${MNN_CPU_SRC})
-list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNCPU>)
-list(APPEND MNN_TARGETS MNNCPU)
+include(${CMAKE_CURRENT_LIST_DIR}/source/backend/cpu/CMakeLists.txt)
 
-# X86_64 AVX/SSE
-if (MNN_USE_SSE)
-include(${CMAKE_CURRENT_LIST_DIR}/source/backend/cpu/x86_x64/CMakeLists.txt)
-endif()
-
-# AArch32/64 Assemblies
-include(${CMAKE_CURRENT_LIST_DIR}/source/backend/cpu/arm/CMakeLists.txt)
-
-IF(NOT DEFINED IOS_ARCH)
-    set(IOS_ARCH "")
-ENDIF()
 
 SET(MNN_PUB_HDRS "")
 SET(MNN_EXPR_PUB_HDRS "")
@@ -513,16 +505,6 @@ IF(MNN_CUDA)
     list(APPEND MNN_EXTRA_DEPENDS ${MNN_CUDA_LIBS})
 ENDIF()
 
-IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR IOS_ARCH STREQUAL "arm64" OR CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
-# ARM82 Assemblies
-    IF(MNN_ARM82)
-        add_definitions(-DENABLE_ARMV82)
-        add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/source/backend/arm82/)
-        list(APPEND MNN_TARGETS MNN_Arm82)
-        list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNN_Arm82>)
-    ENDIF()
-ENDIF()
-
 # Express
 add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/express/)
 IF(MNN_SEP_BUILD)
diff --git a/benchmark/bench_android.sh b/benchmark/bench_android.sh
index 6251b1a2..5c7dcdfb 100755
--- a/benchmark/bench_android.sh
+++ b/benchmark/bench_android.sh
@@ -81,7 +81,7 @@ function bench_android() {
     #benchmark OpenGL
     #adb shell "LD_LIBRARY_PATH=$ANDROID_DIR $ANDROID_DIR/benchmark.out $ANDROID_DIR/benchmark_models $RUN_LOOP 5 6 2>$ANDROID_DIR/benchmark.err >> $ANDROID_DIR/benchmark.txt"
     #benchmark OpenCL
-    #adb shell "LD_LIBRARY_PATH=$ANDROID_DIR $ANDROID_DIR/benchmark.out $ANDROID_DIR/benchmark_models $RUN_LOOP 5 3 2>$ANDROID_DIR/benchmark.err >> $ANDROID_DIR/benchmark.txt"
+    #adb shell "LD_LIBRARY_PATH=$ANDROID_DIR $ANDROID_DIR/benchmark.out $ANDROID_DIR/benchmark_models 100 20 3 2>$ANDROID_DIR/benchmark.err >> $ANDROID_DIR/benchmark.txt"
     adb pull $ANDROID_DIR/benchmark.txt ../
 }
diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp
index 6152738b..f6fa02d7 100644
--- a/benchmark/benchmark.cpp
+++ b/benchmark/benchmark.cpp
@@ -54,7 +54,7 @@ std::vector<Model> findModelFiles(const char* dir) {
 #if defined(_MSC_VER)
     WIN32_FIND_DATA ffd;
     HANDLE hFind = INVALID_HANDLE_VALUE;
-    std::string mnn_model_pattern = std::string(dir) + "\\*.mnn";
+    std::string mnn_model_pattern = std::string(dir) + "\\*.mnn";
     hFind = FindFirstFile(mnn_model_pattern.c_str(), &ffd);
     if (INVALID_HANDLE_VALUE == hFind) {
         std::cout << "open " << dir << " failed: " << strerror(errno) << std::endl;
@@ -178,7 +178,7 @@ void displayStats(const std::string& name, const std::vector<float>& costs) {
         //printf("[ - ] cost:%f ms\n", v);
     }
     avg = costs.size() > 0 ? sum / costs.size() : 0;
-    printf("[ - ] %-24s max = %8.3fms min = %8.3fms avg = %8.3fms\n", name.c_str(), max, avg == 0 ? 0 : min, avg);
+    printf("[ - ] %-24s max = %8.3f ms min = %8.3f ms avg = %8.3f ms\n", name.c_str(), max, avg == 0 ? 0 : min, avg);
 }
 static inline std::string forwardType(MNNForwardType type) {
     switch (type) {
@@ -318,7 +318,7 @@ void set_cpu_affinity()
     int cpu_id = 0;
     cpu_set_t mask;
     CPU_ZERO(&mask);
-    
+
     auto numberOfCPUs = getNumberOfCPU();
     static std::vector<int> sortedCPUIDs;
     static int littleClusterOffset = 0;
@@ -379,10 +379,10 @@ int main(int argc, const char* argv[]) {
     std::vector<Model> models = findModelFiles(argv[1]);
     std::cout << "--------> Benchmarking... loop = " << argv[2] << ", warmup = " << warmup << std::endl;
-    
+
     /* not called yet */
     // set_cpu_affinity();
-    
+
     for (auto& m : models) {
         std::vector<float> costs = doBench(m, loop, warmup, forward, false, numberThread, precision);
         displayStats(m.name, costs);
diff --git a/benchmark/models/mobilenetV3.mnn b/benchmark/models/mobilenetV3.mnn
new file mode 100644
index 00000000..04ba7512
Binary files /dev/null and b/benchmark/models/mobilenetV3.mnn differ
diff --git a/benchmark/models/nasnet.mnn b/benchmark/models/nasnet.mnn
new file mode 100644
index 00000000..d703e858
Binary files /dev/null and b/benchmark/models/nasnet.mnn differ
diff --git a/benchmark/models/squeezenetv1.1.mnn b/benchmark/models/squeezenetv1.1.mnn
new file mode 100644
index 00000000..7985f433
Binary files /dev/null and b/benchmark/models/squeezenetv1.1.mnn differ
diff --git a/codegen/CMakeLists.txt b/codegen/CMakeLists.txt
index 482e41e4..eda621dc 100644
--- a/codegen/CMakeLists.txt
+++ b/codegen/CMakeLists.txt
@@ -6,7 +6,9 @@ option(MNN_CODEGEN_JIT "Build jit for codegen." OFF)
 
 file(GLOB CODEGEN_HEADER "${CMAKE_CURRENT_LIST_DIR}/*.*")
 file(GLOB CPU_SRCS "${CMAKE_CURRENT_LIST_DIR}/cpu/*.*")
+file(GLOB JIT_SRCS "${CMAKE_CURRENT_LIST_DIR}/jit/*.*")
 list(APPEND MNN_CODEGEN_SRCS ${CODEGEN_HEADER})
+list(APPEND MNN_CODEGEN_SRCS ${JIT_SRCS})
 
 if(MNN_CODEGEN_OPENCL)
     add_definitions(-DMNN_CODEGEN_OPENCL)
@@ -34,7 +36,7 @@ if(MNN_CODEGEN_LLVM)
     message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")
     include_directories(${LLVM_INCLUDE_DIRS})
     add_definitions(${LLVM_DEFINITIONS})
-    llvm_map_components_to_libnames(llvm_libs core bitwriter)
+    llvm_map_components_to_libnames(llvm_libs core bitwriter OrcJIT Support nativecodegen native CodeGen)
     list(APPEND MNN_EXTRA_DEPENDS ${llvm_libs})
 endif()
diff --git a/codegen/OpFuse.cpp b/codegen/OpFuse.cpp
index 3330eee1..44aad255 100644
--- a/codegen/OpFuse.cpp
+++ b/codegen/OpFuse.cpp
@@ -9,9 +9,11 @@
 #include "OpFuse.hpp"
 #include "geometry/GeometryComputerUtils.hpp"
 #include "PluginModule.hpp"
-#include
 #include
 #include
+#include "cpu/CPUAst.hpp"
+#include "jit/LLVMJit.hpp"
+
 #if !defined(_MSC_VER)
 #include <dlfcn.h>
 #endif
@@ -73,6 +75,7 @@ bool isLegal(const Command* cmd) {
     if (elemWise) {
         return true;
     }
+#define fuse_raster
 #ifdef fuse_raster
     if (type == OpType_Raster) {
         auto outputFormat = TensorUtils::getDescribe(cmd->outputs[0])->dimensionFormat;
@@ -134,6 +137,136 @@ std::vector<Node*> fuseNode(Node* root, std::vector<Node*>& edges) {
     }
     return fuseSet;
 }
+
+void codegen(CommandBuffer& cmd, std::vector<std::vector<Node*>>& fuseSets) {
+    // generate Kernel
+    CPUPluginModule plugin("codegen_demo");
+    for (auto compSet : fuseSets) {
+        // printf("set size: %lu \n", compSet.size());
+        InOutTensors tensors = plugin.addFunction(compSet);
+        auto inputs = tensors.first;
+        auto outputs = tensors.second;
+        // build Plugin Op
+        Command cmdPlugin;
+        {
+            std::unique_ptr<OpT> pluginOp(new OpT);
+            pluginOp->type = OpType_Plugin;
+            pluginOp->name = "PluginWrapper";
+            PluginT* plugin_param = new PluginT;
+            plugin_param->type = "PluginWrapper";
+            plugin_param->attr.resize(1);
+            plugin_param->attr[0].reset(new AttributeT);
+            plugin_param->attr[0]->key = "kernel";
+            plugin_param->attr[0]->i = plugin.getFunctionNum()-1;
+            pluginOp->main.type = OpParameter_Plugin;
+            pluginOp->main.value = plugin_param;
+            flatbuffers::FlatBufferBuilder builder;
+            auto lastOffset = Op::Pack(builder, pluginOp.get());
+            builder.Finish(lastOffset);
+            cmdPlugin = GeometryComputerUtils::makeCommand(builder, inputs, outputs);
+        }
+        for (int i = 0; i < compSet.size(); i++) {
+            auto cmd = const_cast<Command*>(compSet[i]->cmd);
+            if (i == compSet.size()-1) {
+                cmd->op = cmdPlugin.op;
+                cmd->inputs = cmdPlugin.inputs;
+                cmd->outputs = cmdPlugin.outputs;
+                cmd->buffer = cmdPlugin.buffer;
+            } else {
+                cmd->op = nullptr;
+                cmd->buffer.clear();
+            }
+        }
+    }
+    // printf("total: %d\n", idx);
+    plugin.codegen();
+    // printf("cmd num: %lu \n", cmd.command.size());
+    for (auto iter = cmd.command.begin(); iter != cmd.command.end();) {
+        if (iter->op == nullptr) {
+            iter = cmd.command.erase(iter);
+        } else {
+            ++iter;
+        }
+    }
+#if !defined(_MSC_VER)
+    // printf("cmd num: %lu \n", cmd.command.size());
+    dlopen("./libplugin_fuse.so", RTLD_NOW | RTLD_LOCAL);
+#endif
+}
+
+void jit(CommandBuffer& cmd, std::vector<std::vector<Node*>>& fuseSets) {
+    LLVMJIT* theJit = LLVMJIT::createLLVMJIT();
+    CPUPluginModule plugin("jit_demo");
+    std::string kernelStr;
+    for (auto compSet : fuseSets) {
+        /*
+        // printf("set size: %lu \n", compSet.size());
+        if (true) {
+            for (auto com : compSet) {
+                // json :
+                // { fusedOps: [ { idx:int, srcOps: [name: string], inputs:[name:string], outputs:[name:string] } ], dynlib:string, jitObj:string, module:string }
+                dumpCmd(com->cmd);
+            }
+        }
+        */
+        kernelStr += "[";
+        for (auto com : compSet) {
+            kernelStr += com->cmd->op->name()->str();
+        }
+        kernelStr += "]";
+        InOutTensors tensors = plugin.addFunction(compSet);
+        auto inputs = tensors.first;
+        auto outputs = tensors.second;
+        // build Plugin Op
+        Command cmdPlugin;
+        {
+            std::unique_ptr<OpT> pluginOp(new OpT);
+            pluginOp->type = OpType_Plugin;
+            pluginOp->name = "JitPluginWrapper";
+            PluginT* plugin_param = new PluginT;
+            plugin_param->type = "JitPluginWrapper";
+            plugin_param->attr.resize(1);
+            plugin_param->attr[0].reset(new AttributeT);
+            plugin_param->attr[0]->key = "kernel";
+            plugin_param->attr[0]->i = plugin.getFunctionNum() - 1;
+            pluginOp->main.type = OpParameter_Plugin;
+            pluginOp->main.value = plugin_param;
+            flatbuffers::FlatBufferBuilder builder;
+            auto lastOffset = Op::Pack(builder, pluginOp.get());
+            builder.Finish(lastOffset);
+            cmdPlugin = GeometryComputerUtils::makeCommand(builder, inputs, outputs);
+        }
+        for (int i = 0; i < compSet.size(); i++) {
+            auto cmd = const_cast<Command*>(compSet[i]->cmd);
+            if (i == compSet.size()-1) {
+                cmd->op = cmdPlugin.op;
+                cmd->inputs = cmdPlugin.inputs;
+                cmd->outputs = cmdPlugin.outputs;
+                cmd->buffer = cmdPlugin.buffer;
+            } else {
+                cmd->op = nullptr;
+                cmd->buffer.clear();
+            }
+        }
+    }
+    for (auto iter = cmd.command.begin(); iter != cmd.command.end();) {
+        if (iter->op == nullptr) {
+            iter = cmd.command.erase(iter);
+        } else {
+            ++iter;
+        }
+    }
+    size_t id = std::hash<std::string>()(kernelStr);
+    std::unique_ptr<LLVMTarget> target(new LLVMTarget("jit-kenerl-" + std::to_string(id)));
+    target->getModule()->setDataLayout(theJit->getDataLayout());
+    plugin.codegen(target.get());
+    // add module to JIT and compile
+    auto m = target->getThreadSafeModule();
+    auto resourceTracker = theJit->getMainJITDylib().createResourceTracker();
+    theJit->addModule(std::move(m), resourceTracker);
+    theJit->compileAllFunction(plugin.getFunctionNum());
+}
+
 bool opFuse(CommandBuffer& cmd) {
     std::unordered_map outputTensor;
     // build graph
@@ -208,59 +341,7 @@ bool opFuse(CommandBuffer& cmd) {
             postDominateNodeQueue.push(child);
         }
     }
-    // generate Kernel
-    CPUPluginModule plugin("fuse_demo");
-    for (auto compSet : fuseSets) {
-        // printf("set size: %lu \n", compSet.size());
-        InOutTensors tensors = plugin.addFunction(compSet);
-        auto inputs = tensors.first;
-        auto outputs = tensors.second;
-        // build Plugin Op
-        Command cmdPlugin;
-        {
-            std::unique_ptr<OpT> pluginOp(new OpT);
-            pluginOp->type = OpType_Plugin;
-            pluginOp->name = "PluginWrapper";
-            PluginT* plugin_param = new PluginT;
-            plugin_param->type = "PluginWrapper";
-            plugin_param->attr.resize(1);
-            plugin_param->attr[0].reset(new AttributeT);
-            plugin_param->attr[0]->key = "kernel";
-            plugin_param->attr[0]->i = plugin.getFunctionNum()-1;
-            pluginOp->main.type = OpParameter_Plugin;
-            pluginOp->main.value = plugin_param;
-            flatbuffers::FlatBufferBuilder builder;
-            auto lastOffset = Op::Pack(builder, pluginOp.get());
-            builder.Finish(lastOffset);
-            cmdPlugin = GeometryComputerUtils::makeCommand(builder, inputs, outputs);
-        }
-        for (int i = 0; i < compSet.size(); i++) {
-            auto cmd = const_cast<Command*>(compSet[i]->cmd);
-            if (i == compSet.size()-1) {
-                cmd->op = cmdPlugin.op;
-                cmd->inputs = cmdPlugin.inputs;
-                cmd->outputs = cmdPlugin.outputs;
-                cmd->buffer = cmdPlugin.buffer;
-            } else {
-                cmd->op = nullptr;
-                cmd->buffer.clear();
-            }
-        }
-    }
-    // printf("total: %d\n", idx);
-    plugin.codegen();
-    // printf("cmd num: %lu \n", cmd.command.size());
-    for (auto iter = cmd.command.begin(); iter != cmd.command.end();) {
-        if (iter->op == nullptr) {
-            iter = cmd.command.erase(iter);
-        } else {
-            ++iter;
-        }
-    }
-#if !defined(_MSC_VER)
-    // printf("cmd num: %lu \n", cmd.command.size());
-    dlopen("./libplugin_fuse.so", RTLD_NOW | RTLD_LOCAL);
-#endif
+    jit(cmd, fuseSets);
     return true;
 }
 } // namespace MNN
diff --git a/codegen/PluginModule.hpp b/codegen/PluginModule.hpp
index a13785a2..d0a857b1 100644
--- a/codegen/PluginModule.hpp
+++ b/codegen/PluginModule.hpp
@@ -38,6 +38,7 @@ public:
     virtual void codegen() = 0;
 };
 
+class LLVMTarget;
 #ifdef MNN_CODEGEN_CPU
 class CPUPluginModule : PluginModule{
 public:
@@ -49,6 +50,7 @@ public:
     InOutTensors addFunction(std::vector<Node*> nodes) override;
     const int getFunctionNum() override { return functions.size(); }
     void codegen() override;
+    void codegen(LLVMTarget* target);
 private:
     class CPUPluginFunction;
     std::vector<std::unique_ptr<CPUPluginFunction>> functions;
diff --git a/codegen/cpu/CPUAst.hpp b/codegen/cpu/CPUAst.hpp
index fcf1ab23..fd7b4c68 100644
--- a/codegen/cpu/CPUAst.hpp
+++ b/codegen/cpu/CPUAst.hpp
@@ -21,47 +21,45 @@
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
+#include "llvm/ExecutionEngine/Orc/LLJIT.h"
 
 using namespace llvm;
+using namespace llvm::orc;
 #endif
 
-class Target {
-public:
-    Target() {}
-    virtual ~Target() {}
-private:
-    std::string name;
-};
-
 #ifdef MNN_CODEGEN_LLVM
-class LLVMTarget : public Target {
+class LLVMTarget {
 public:
-    LLVMTarget(std::string& name) {
-        llvmBuilder = std::make_unique<IRBuilder<>>(llvmContext);
-        llvmModule = std::make_unique<Module>(name, llvmContext);
-        llvmModule->setTargetTriple("x86_64-apple-macosx10.15.0");
+    LLVMTarget(std::string name) {
+        llvmContext.reset(new LLVMContext);
+        llvmBuilder = std::make_unique<IRBuilder<>>(*llvmContext.get());
+        llvmModule = std::make_unique<Module>(name, *llvmContext.get());
+        llvmModule->setTargetTriple("x86_64-apple-macosx11.0.0");
     }
-    ~LLVMTarget() override {}
+    ~LLVMTarget() {}
     Module* getModule() {
         return llvmModule.get();
     }
     LLVMContext& getContext() {
-        return llvmContext;
+        return *llvmContext.get();
     }
    IRBuilder<>* getBuilder() {
        return llvmBuilder.get();
    }
+    ThreadSafeModule getThreadSafeModule() {
+        return ThreadSafeModule(std::move(llvmModule), std::move(llvmContext));
+    }
 private:
-    LLVMContext llvmContext;
+    std::unique_ptr<LLVMContext> llvmContext;
     std::unique_ptr<IRBuilder<>> llvmBuilder;
     std::unique_ptr<Module> llvmModule;
 };
 #endif
 
 #ifdef MNN_CODEGEN_C
-class SourceTarget : public Target {
+class SourceTarget {
 public:
     SourceTarget() {}
-    ~SourceTarget() override {}
+    ~SourceTarget() {}
     void addIndent() { indent++; }
     void subIndent() { indent--; }
     std::string getIndent() {
@@ -74,7 +72,7 @@ private:
 class CTarget : public SourceTarget {
 public:
     CTarget(std::string& name) {}
-    ~CTarget() override {}
+    ~CTarget() {}
 };
 #endif
 
diff --git a/codegen/cpu/CPUPluginModule.cpp b/codegen/cpu/CPUPluginModule.cpp
index 2ea0d067..05213f5c 100644
--- a/codegen/cpu/CPUPluginModule.cpp
+++ b/codegen/cpu/CPUPluginModule.cpp
@@ -233,6 +233,12 @@ private:
     std::unique_ptr function;
 };
 
+void CPUPluginModule::codegen(LLVMTarget* target) {
+    for (int i = 0; i < getFunctionNum(); i++) {
+        functions[i]->codegen(target);
+    }
+}
+
 void CPUPluginModule::codegen() {
     std::ofstream headerFile("./kernel.h");
     std::ofstream sourceFile("./kernel.c");
diff --git a/codegen/jit/JitPluginWrapper.cpp b/codegen/jit/JitPluginWrapper.cpp
new file mode 100644
index 00000000..f82992ed
--- /dev/null
+++ b/codegen/jit/JitPluginWrapper.cpp
@@ -0,0 +1,56 @@
+//
+// JitPluginWrapper.cpp
+// Codegen
+//
+// Created by MNN on 2021/01/29.
+// Copyright © 2018, Alibaba Group Holding Limited
+//
+#include "jit/LLVMJit.hpp"
+#include "MNN/plugin/PluginKernel.hpp"
+#include "cpu/CPUAst.hpp"
+#include
+
+MNN_PUBLIC int _intPluginWrapper = 10; // Just for linking successfully.
+
+using namespace llvm;
+using namespace llvm::orc;
+
+namespace MNN {
+namespace plugin {
+
+namespace backend {
+class JitPluginWrapper : public CPUComputeKernel {
+public:
+    bool init(CPUKernelContext*) override { return true; }
+    bool compute(CPUKernelContext* ctx) override;
+};
+
+bool JitPluginWrapper::compute(CPUKernelContext* ctx) {
+    int kernelIdx = 0;
+    if (ctx->hasAttr("kernel")) {
+        kernelIdx = ctx->getAttr("kernel")->i();
+    }
+
+    LLVMJIT* jit = LLVMJIT::createLLVMJIT();
+    MNN_ASSERT(jit != nullptr);
+
+    int I = ctx->inputs().size();
+    float** inputs = new float*[I];
+    for (int i = 0; i < I; i++) {
+        inputs[i] = reinterpret_cast<float*>(ctx->input(i)->buffer().host);
+    }
+    int O = ctx->outputs().size();
+    float** outputs = new float*[O];
+    for (int i = 0; i < O; i++) {
+        outputs[i] = reinterpret_cast<float*>(ctx->output(i)->buffer().host);
+    }
+    void (*kernel)(float**, float**) = (void (*)(float**, float**))jit->getFuncByIdx(kernelIdx);
+    kernel(inputs, outputs);
+    return true;
+}
+} // namespace backend
+
+REGISTER_PLUGIN_COMPUTE_KERNEL(JitPluginWrapper, backend::JitPluginWrapper);
+
+} // namespace plugin
+} // namespace MNN
diff --git a/codegen/jit/LLVMJit.cpp b/codegen/jit/LLVMJit.cpp
new file mode 100644
index 00000000..ca12a711
--- /dev/null
+++ b/codegen/jit/LLVMJit.cpp
@@ -0,0 +1,187 @@
+//
+// LLVMJit.cpp
+// MNN
+//
+// Created by MNN on 2021/2/2.
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "jit/LLVMJit.hpp" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/CodeGen/CommandFlags.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/ExecutionEngine/SectionMemoryManager.h" +#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" + +#include "llvm/ExecutionEngine/ObjectCache.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/raw_ostream.h" +#include +class MCJITObjectCache : public ObjectCache { +public: + MCJITObjectCache() { + sys::fs::current_path(CacheDir); + sys::path::append(CacheDir, "mnn_object_cache"); + } + + virtual ~MCJITObjectCache() {} + + bool isCached(std::string moduleId) { + SmallString<128> IRCacheFile = CacheDir; + sys::path::append(IRCacheFile, moduleId); + return sys::fs::exists(IRCacheFile.str()); + } + + virtual void notifyObjectCompiled(const Module *M, MemoryBufferRef Obj) { + const std::string ModuleID = M->getModuleIdentifier(); + + if (0 == ModuleID.compare(0, 4, "jit-")) { + std::string IRFileName = ModuleID; + SmallString<128>IRCacheFile = CacheDir; + sys::path::append(IRCacheFile, IRFileName); + if (!sys::fs::exists(CacheDir.str()) && sys::fs::create_directory(CacheDir.str())) { + fprintf(stderr, "Unable to create cache directory\n"); + return; + } + std::error_code ec; + raw_fd_ostream IRObjectFile(IRCacheFile.c_str(), ec, sys::fs::F_None); + IRObjectFile << Obj.getBuffer(); + } + } + + virtual std::unique_ptr getObject(const Module* M) { + if (!isCached(M->getModuleIdentifier())) { + return nullptr; + } + SmallString<128> IRCacheFile = CacheDir; + sys::path::append(IRCacheFile, M->getModuleIdentifier()); + ErrorOr> IRObjectBuffer = MemoryBuffer::getFile(IRCacheFile.c_str(), -1, false); + if (!IRObjectBuffer) { + return nullptr; + } + return MemoryBuffer::getMemBufferCopy(IRObjectBuffer.get()->getBuffer()); + } + +private: + SmallString<128> CacheDir; +}; + +static MCJITObjectCache cacheObj; +LLVMJIT* LLVMJIT::llvmJit = nullptr; + +LLVMJIT::LLVMJIT(std::unique_ptr tpc, std::unique_ptr es, JITTargetMachineBuilder jtmb, DataLayout dl) + : processControl(std::move(tpc)), executionSession(std::move(es)), dataLayout(std::move(dl)), + mangle(*this->executionSession, this->dataLayout), + objectLayer(*this->executionSession, []() { return std::make_unique(); }), + compileLayer(*this->executionSession, objectLayer, std::make_unique(std::move(jtmb))), + optimizeLayer(*this->executionSession, compileLayer, optimizeModule), + mainJD(this->executionSession->createBareJITDylib("
")) { + mainJD.addGenerator(cantFail(DynamicLibrarySearchGenerator::GetForCurrentProcess(dl.getGlobalPrefix()))); +} + +LLVMJIT::~LLVMJIT() { + if (auto Err = executionSession->endSession()) { + executionSession->reportError(std::move(Err)); + } +} + +void LLVMJIT::addModule(ThreadSafeModule tsm, ResourceTrackerSP rt) { + if (!rt) { + rt = mainJD.getDefaultResourceTracker(); + } + ExitOnErr(optimizeLayer.add(rt, std::move(tsm))); +} + +Expected LLVMJIT::lookup(StringRef Name) { + return executionSession->lookup({&mainJD}, mangle(Name.str())); +} + +void LLVMJIT::compileAllFunction(int num) { + auto comp = static_cast(&compileLayer.getCompiler()); + comp->setObjectCache(&cacheObj); + functions.resize(num); + for (int i = 0; i < num; i++) { + functions[i] = getFuncByName("kernel_" + std::to_string(i)); + } +} + +uint64_t LLVMJIT::getFuncByName(std::string name) { + return ExitOnErr(lookup(name)).getAddress(); +} + +uint64_t LLVMJIT::getFuncByIdx(int idx) { + if (functions.size() <= idx) { + return 0; + } + return functions[idx]; +} + +LLVMJIT* LLVMJIT::createLLVMJIT() { + if (llvmJit != nullptr) { + return llvmJit; + } + InitializeNativeTarget(); + InitializeNativeTargetAsmPrinter(); + InitializeNativeTargetAsmParser(); + auto tpc = SelfTargetProcessControl::Create(); + if (!tpc) { + return nullptr; + } + auto es = std::make_unique(); + JITTargetMachineBuilder jtmb((*tpc)->getTargetTriple()); + auto dl = jtmb.getDefaultDataLayoutForTarget(); + if (!dl) { + return nullptr; + } + llvmJit = new LLVMJIT(std::move(*tpc), std::move(es), std::move(jtmb), std::move(*dl)); + return llvmJit; +} + +TargetMachine* LLVMJIT::GetTargetMachine(Triple TheTriple) { + std::string Error; + const Target *TheTarget = TargetRegistry::lookupTarget(codegen::getMArch(), TheTriple, Error); + if (!TheTarget) { + return nullptr; + } + return TheTarget->createTargetMachine(TheTriple.getTriple(), codegen::getCPUStr(), codegen::getFeaturesStr(), codegen::InitTargetOptionsFromCodeGenFlags(TheTriple), + codegen::getExplicitRelocModel(), codegen::getExplicitCodeModel(), CodeGenOpt::Aggressive); +} + +Expected LLVMJIT::optimizeModule(ThreadSafeModule tsm, const MaterializationResponsibility &mr) { + static codegen::RegisterCodeGenFlags CFG; + tsm.withModuleDo([](Module &m) { + if (cacheObj.isCached(m.getModuleIdentifier())) { + return; + } + auto modulePassManager = std::make_unique(); + auto funcPassManager = std::make_unique(&m); + { + Triple moduleTriple(m.getTargetTriple()); + TargetMachine *Machine = nullptr; + if (moduleTriple.getArch()) { + Machine = GetTargetMachine(moduleTriple); + } + modulePassManager->add(createTargetTransformInfoWrapperPass(Machine->getTargetIRAnalysis())); + funcPassManager->add(createTargetTransformInfoWrapperPass(Machine->getTargetIRAnalysis())); + PassManagerBuilder builder; + builder.OptLevel = 3; + builder.SizeLevel = 0; + // builder.Inliner = createFunctionInliningPass(3, 0, false); + builder.DisableUnrollLoops = false; + builder.LoopVectorize = true; + builder.SLPVectorize = true; + builder.populateFunctionPassManager(*funcPassManager.get()); + builder.populateModulePassManager(*modulePassManager.get()); + funcPassManager->doInitialization(); + for (auto &function : m) { + funcPassManager->run(function); + } + funcPassManager->doFinalization(); + modulePassManager->run(m); + } + }); + return std::move(tsm); +} diff --git a/codegen/jit/LLVMJit.hpp b/codegen/jit/LLVMJit.hpp new file mode 100644 index 00000000..5d31a0ac --- /dev/null +++ b/codegen/jit/LLVMJit.hpp @@ -0,0 +1,60 @@ +// +// 
LLVMJit.hpp +// MNN +// +// Created by MNN on 2021/2/2. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "llvm/IR/DataLayout.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ExecutionEngine/Orc/CompileUtils.h" +#include "llvm/ExecutionEngine/Orc/TargetProcessControl.h" +#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" +#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" +#include "llvm/ExecutionEngine/Orc/IRTransformLayer.h" + +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace llvm::orc; + +class LLVMJIT { +public: + LLVMJIT(std::unique_ptr tpc, std::unique_ptr es, JITTargetMachineBuilder jtmb, DataLayout dl); + + ~LLVMJIT(); + + static LLVMJIT* createLLVMJIT(); + + const DataLayout &getDataLayout() const { return dataLayout; } + + JITDylib &getMainJITDylib() { return mainJD; } + + void addModule(ThreadSafeModule tsm, ResourceTrackerSP rt = nullptr); + + Expected lookup(StringRef Name); + + void compileAllFunction(int num); + + uint64_t getFuncByName(std::string name); + + uint64_t getFuncByIdx(int idx); +private: + static TargetMachine* GetTargetMachine(Triple TheTriple); + static Expected optimizeModule(ThreadSafeModule tsm, const MaterializationResponsibility &mr); +private: + std::unique_ptr processControl; + std::unique_ptr executionSession; + std::vector functions; + RTDyldObjectLinkingLayer objectLayer; + IRCompileLayer compileLayer; + IRTransformLayer optimizeLayer; + DataLayout dataLayout; + MangleAndInterner mangle; + JITDylib &mainJD; + ExitOnError ExitOnErr; + Triple targetTriple; + static LLVMJIT* llvmJit; +}; diff --git a/demo/exec/segment.cpp b/demo/exec/segment.cpp index 3a607cc6..3f5a5121 100644 --- a/demo/exec/segment.cpp +++ b/demo/exec/segment.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" #define STB_IMAGE_WRITE_IMPLEMENTATION @@ -32,39 +33,28 @@ int main(int argc, const char* argv[]) { MNN_PRINT("Usage: ./segment.out model.mnn input.jpg output.jpg\n"); return 0; } - auto net = Variable::getInputAndOutput(Variable::loadMap(argv[1])); - if (net.first.empty()) { + std::shared_ptr net; + net.reset(Interpreter::createFromFile(argv[1])); + if (net == nullptr) { MNN_ERROR("Invalid Model\n"); return 0; } - auto input = net.first.begin()->second; - auto info = input->getInfo(); - if (nullptr == info) { - MNN_ERROR("The model don't have init dim\n"); - return 0; + ScheduleConfig config; + auto session = net->createSession(config); + auto input = net->getSessionInput(session, nullptr); + auto shape = input->shape(); + if (shape[0] != 1) { + shape[0] = 1; + net->resizeTensor(input, shape); + net->resizeSession(session); } - auto shape = input->getInfo()->dim; - shape[0] = 1; - input->resize(shape); - auto output = net.second.begin()->second; - if (nullptr == output->getInfo()) { - MNN_ERROR("Alloc memory or compute size error\n"); - return 0; - } - { int size_w = 0; int size_h = 0; int bpp = 0; - if (info->order == NHWC) { - bpp = shape[3]; - size_h = shape[1]; - size_w = shape[2]; - } else { - bpp = shape[1]; - size_h = shape[2]; - size_w = shape[3]; - } + bpp = shape[1]; + size_h = shape[2]; + size_w = shape[3]; if (bpp == 0) bpp = 1; if (size_h == 0) @@ -97,47 +87,44 @@ int main(int argc, const char* argv[]) { std::shared_ptr pretreat(ImageProcess::create(config)); pretreat->setMatrix(trans); - pretreat->convert((uint8_t*)inputImage, width, height, 0, input->writeMap(), size_w, size_h, 4, 0, 
halide_type_of()); + pretreat->convert((uint8_t*)inputImage, width, height, 0, input->host(), size_w, size_h, 4, 0, halide_type_of()); stbi_image_free(inputImage); - input->unMap(); } + // Run model + net->runSession(session); + + // Post treat by MNN-Express { - //auto originOrder = output->getInfo()->order; - output = _Convert(output, NHWC); - //output = _Softmax(output, -1); - auto outputInfo = output->getInfo(); - auto width = outputInfo->dim[2]; - auto height = outputInfo->dim[1]; - auto channel = outputInfo->dim[3]; - std::shared_ptr wrapTensor(ImageProcess::createImageTensor(width, height, 4, nullptr)); - MNN_PRINT("Mask: w=%d, h=%d, c=%d\n", width, height, channel); - auto outputHostPtr = output->readMap(); - for (int y = 0; y < height; ++y) { - auto rgbaY = wrapTensor->host() + 4 * y * width; - auto sourceY = outputHostPtr + y * width * channel; - for (int x=0; x maxValue) { - index = c; - maxValue = sourceX[c]; - } - } - rgba[0] = 255; - rgba[2] = 0; - rgba[1] = 0; - rgba[3] = 255; - if (15 == index) { - rgba[2] = 255; - rgba[3] = 0; - } - } + /* Create VARP by tensor Begin*/ + auto outputTensor = net->getSessionOutput(session, nullptr); + // First Create a Expr, then create Variable by the 0 index of expr + auto output = Variable::create(Expr::create(outputTensor)); + if (nullptr == output->getInfo()) { + MNN_ERROR("Alloc memory or compute size error\n"); + return 0; } - output->unMap(); - stbi_write_png(argv[3], width, height, 4, wrapTensor->host(), 4 * width); + /* Create VARP by tensor End*/ + + // Turn dataFormat to NHWC for easy to run TopKV2 + output = _Convert(output, NHWC); + auto width = output->getInfo()->dim[2]; + auto height = output->getInfo()->dim[1]; + auto channel = output->getInfo()->dim[3]; + MNN_PRINT("output w = %d, h=%d\n", width, height); + + const int humanIndex = 15; + output = _Reshape(output, {-1, channel}); + auto kv = _TopKV2(output, _Scalar(1)); + // Use indice in TopKV2's C axis + auto index = kv[1]; + // If is human, set 255, else set 0 + auto mask = _Select(_Equal(index, _Scalar(humanIndex)), _Scalar(255), _Scalar(0)); + + //If need faster, use this code + //auto mask = _Equal(index, _Scalar(humanIndex)) * _Scalar(255); + + mask = _Cast(mask); + stbi_write_png(argv[3], width, height, 1, mask->readMap(), width); } return 0; } diff --git a/express/Executor.cpp b/express/Executor.cpp index fa8f753d..a8004155 100644 --- a/express/Executor.cpp +++ b/express/Executor.cpp @@ -12,6 +12,7 @@ #include "Utils.hpp" #include #include "core/WrapExecution.hpp" +#include "core/OpCommonUtils.hpp" #include "geometry/GeometryComputerUtils.hpp" #include #ifdef MNN_EXPR_ENABLE_PROFILER @@ -127,10 +128,10 @@ Executor::Requirement Executor::getRequirement(Expr* expr) const { return req; } for (int i = 0; i < inputSize; ++i) { - req.contentNeedContent[i] = SizeComputer::opNeedContent(op->type(), i); + req.contentNeedContent[i] = OpCommonUtils::opNeedContent(op->type(), i); req.shapeNeedContent[i] = false; } - auto needIndexId = SizeComputer::needInputContent(op); + auto needIndexId = SizeComputer::needInputContent(op, inputSize); for (auto index : needIndexId) { if (index < req.shapeNeedContent.size()) { req.shapeNeedContent[index] = true; @@ -440,7 +441,7 @@ ErrorCode Executor::ComputeCache::resize() { op = flatbuffers::GetMutableRoot(cmd.buffer.data()); } for (auto v = 0; vtype(), v)) { + if (!OpCommonUtils::opNeedContent(op->type(), v)) { continue; } auto des = TensorUtils::getDescribe(cmd.inputs[v]); @@ -495,7 +496,7 @@ ErrorCode Executor::ComputeCache::resize() { 
         auto bn = mExecutions[k]->backend();
         auto iterType = bn->type();
         for (int i=0; i<cmd.inputs.size(); ++i) {
-            if (!SizeComputer::opNeedContent(op->type(), i)) {
+            if (!OpCommonUtils::opNeedContent(op->type(), i)) {
                 continue;
             }
             auto inpDes = TensorUtils::getDescribe(cmd.inputs[i]);
@@ -550,7 +551,7 @@ ErrorCode Executor::ComputeCache::resize() {
             return code;
         }
         for (auto v = 0; v<cmd.inputs.size(); ++v) {
-            if (!SizeComputer::opNeedContent(op->type(), v)) {
+            if (!OpCommonUtils::opNeedContent(op->type(), v)) {
                 continue;
             }
             auto t = cmd.inputs[v];
diff --git a/express/Expr.cpp b/express/Expr.cpp
index 96405cf5..64c7cb1a 100644
--- a/express/Expr.cpp
+++ b/express/Expr.cpp
@@ -99,8 +99,8 @@ Expr::Expr(int outputSize) {
     mInside.reset(new Inside(outputSize));
     mOutputNames.resize(outputSize);
 }
-Expr::Expr(Tensor* tensor) {
-    mInside.reset(new Inside(tensor));
+Expr::Expr(Tensor* tensor, bool own) {
+    mInside.reset(new Inside(tensor, own));
     mOutputNames.resize(1);
 }
 
@@ -129,8 +129,8 @@ void Expr::_addLinkForInputs(EXPRP expr) {
         }
     }
 }
-EXPRP Expr::create(Tensor* tensor) {
-    EXPRP expr(new Expr(tensor));
+EXPRP Expr::create(Tensor* tensor, bool own) {
+    EXPRP expr(new Expr(tensor, own));
     expr->mOp = nullptr;
     expr->mType = VARP::CONSTANT;
     auto& dstInfo = expr->mInside->mOutputInfos[0];
@@ -566,8 +566,11 @@ void* Variable::readInternal(bool forShape) {
     auto inside = mFrom->inside();
     auto originTensor = inside->mOutputTensors[0];
     if (0 != originTensor->buffer().device) {
+        // For StaticModule with other-device runtime, we may create Variable with other-device's memory
+        // The case won't occur for variable = INPUT
         // Need Copy
         if (nullptr != inside->mHostTensor) {
+            // The Varp will not be created as input, so we just need to copy once
            return inside->mHostTensor->host();
        }
        inside->mHostTensor = new Tensor;
@@ -838,7 +841,7 @@ void Variable::save(const std::vector<VARP>& vars, NetT* dest) {
         auto& info = expr->mInside->mOutputInfos[0];
         const void* ptr = expr->mInside->mOutputTensors[0]->host();
         VARP temp;
-        if (nullptr == ptr) {
+        if (nullptr == ptr || expr->mInside->mOutputTensors[0]->deviceId() > 0) {
             temp = Variable::create(expr);
             ptr = temp->readMap();
         }
diff --git a/express/NeuralNetWorkOp.cpp b/express/NeuralNetWorkOp.cpp
index 21b2cde3..7c0111e6 100644
--- a/express/NeuralNetWorkOp.cpp
+++ b/express/NeuralNetWorkOp.cpp
@@ -392,12 +392,15 @@ output: A variable with the same type as `x`.
 */
 VARP _Reshape(VARP x, VARP shape) {
     MNN_ASSERT(nullptr != x);
-    MNN_ASSERT(nullptr != x->getInfo());
     std::unique_ptr<OpT> reshape(new OpT);
     reshape->type = OpType_Reshape;
     reshape->main.type = OpParameter_Reshape;
     reshape->main.value = new ReshapeT;
-    reshape->main.AsReshape()->dimType = (MNN_DATA_FORMAT)Utils::convertFormat(x->getInfo()->order);
+    if (nullptr != x->getInfo()) {
+        reshape->main.AsReshape()->dimType = (MNN_DATA_FORMAT)Utils::convertFormat(x->getInfo()->order);
+    } else {
+        reshape->main.AsReshape()->dimType = MNN_DATA_FORMAT_NHWC;
+    }
     return (Variable::create(Expr::create(reshape.get(), {x, shape})));
 }
 VARP _Scale(VARP x, int channels, std::vector<float>&& scales, std::vector<float>&& bias) {
@@ -425,7 +428,7 @@ VARP _Relu(VARP x, float slope) {
     relu->main.AsRelu()->slope = slope;
     return (Variable::create(Expr::create(relu.get(), {x})));
 }
-/*Given an input value x, it computes Rectified Linear 6: min(max(x, 0), 6).
+/*Given an input value x, it computes Rectified Linear 6: min(max(x, 0), 6).
 Args:
 x: A variable.
Returns: @@ -1562,6 +1565,36 @@ VARP _CosineSimilarity(VARP input0, VARP input1, VARP inputDim) { return (Variable::create(Expr::create(std::move(cosineSimilarityOp), {input0, input1, inputDim}))); } +VARP _GridSample(VARP input, VARP grid, InterpolationMethod mode, GridSamplePaddingMode paddingMode, bool alignCorners) { + std::unique_ptr op(new OpT); + op->type = OpType_GridSample; + op->main.type = OpParameter_GridSample; + op->main.value = new GridSampleT; + switch (mode) { + case NEAREST: + op->main.AsGridSample()->mode = SampleMode_NEAREST; + break; + case BILINEAR: + default: + op->main.AsGridSample()->mode = SampleMode_BILINEAR; + break; + } + switch (paddingMode) { + case GRID_SAMPLE_PADDING_BORDER: + op->main.AsGridSample()->paddingMode = BorderMode_CLAMP; + break; + case GRID_SAMPLE_PADDING_REFLECTION: + op->main.AsGridSample()->paddingMode = BorderMode_REFLECTION; + break; + case GRID_SAMPLE_PADDING_ZEROS: + default: + op->main.AsGridSample()->paddingMode = BorderMode_ZEROS; + break; + } + op->main.AsGridSample()->alignCorners = alignCorners; + return (Variable::create(Expr::create(std::move(op), {input, grid}))); +} + VARP _FloatToInt8(VARP x, VARP scale, char minValue/*For future*/, char maxValue/*For future*/) { auto xInfo = x->getInfo(); auto scaleInfo = scale->getInfo(); @@ -1574,7 +1607,7 @@ VARP _FloatToInt8(VARP x, VARP scale, char minValue/*For future*/, char maxValue MNN_ERROR("Not Support Input for FloatToInt8 because var not NC4HW4 or not float\n"); return nullptr; } - if (scaleInfo->size != xInfo->dim[1]) { + if ((scaleInfo->size != xInfo->dim[1]) && (scaleInfo->size != 1)) { MNN_ERROR("Scale's size not match input's channel: %d - %d\n", scaleInfo->size, xInfo->dim[1]); return nullptr; } @@ -1599,7 +1632,7 @@ VARP _FloatToInt8(VARP x, VARP scale, int8_t minValue, int8_t maxValue, int8_t z MNN_ERROR("Not Support Input for FloatToInt8 because var not NC4HW4 or not float\n"); return nullptr; } - if (scaleInfo->size != xInfo->dim[1]) { + if ((scaleInfo->size != xInfo->dim[1]) && (scaleInfo->size != 1)) { MNN_ERROR("Scale's size not match input's channel: %d - %d\n", scaleInfo->size, xInfo->dim[1]); return nullptr; } @@ -1628,7 +1661,7 @@ VARP _Int8ToFloat(VARP x, VARP scale) { MNN_ERROR("Not Support Input for _Int8ToFloat because var not NC4HW4 or not int8\n"); return nullptr; } - if (scaleInfo->size != xInfo->dim[1]) { + if ((scaleInfo->size != xInfo->dim[1]) && (scaleInfo->size != 1)) { MNN_ERROR("_Int8ToFloat Scale's size not match input's channel\n"); return nullptr; } @@ -1653,7 +1686,7 @@ VARP _Int8ToFloat(VARP x, VARP scale, int8_t zeroPoint) { MNN_ERROR("Not Support Input for _Int8ToFloat because var not NC4HW4 or not int8\n"); return nullptr; } - if (scaleInfo->size != xInfo->dim[1]) { + if ((scaleInfo->size != xInfo->dim[1]) && (scaleInfo->size != 1)) { MNN_ERROR("_Int8ToFloat Scale's size not match input's channel\n"); return nullptr; } @@ -1673,5 +1706,16 @@ VARP _Select(VARP select, VARP input0, VARP input1) { return (Variable::create(Expr::create(std::move(selectOp), {select, input0, input1}))); } +std::vector _TopKV2(VARP input0, VARP input1) { + std::unique_ptr op(new OpT); + op->type = OpType_TopKV2; + auto expr = Expr::create(op.get(), {input0, input1}, 2); + std::vector res(2); + res[0] = Variable::create(expr, 0); + res[1] = Variable::create(expr, 1); + return res; +} + + } // namespace Express } // namespace MNN diff --git a/express/Utils.cpp b/express/Utils.cpp index 95aee23a..a0b31ca1 100644 --- a/express/Utils.cpp +++ b/express/Utils.cpp @@ 
-25,14 +25,13 @@ Expr::Inside::Inside(int outputSize) { TensorUtils::getDescribe(mOutputTensors[i])->memoryType = Tensor::InsideDescribe::MEMORY_HOST; } } -Expr::Inside::Inside(Tensor* tensor) { +Expr::Inside::Inside(Tensor* tensor, bool own) { mOutputInfos.resize(1); mOutputTensors.resize(1); mOutputTensors[0] = tensor; Utils::copyTensorToInfo(&mOutputInfos[0], tensor); mOutputInfos[0].syncSize(); - mOutputInfos[0].tensorArrayAttr = TensorUtils::getDescribe(tensor)->tensorArrayAttr; - mOwnTensor = false; + mOwnTensor = own; } Expr::Inside::~Inside() { diff --git a/express/Utils.hpp b/express/Utils.hpp index 395d1509..72076b78 100644 --- a/express/Utils.hpp +++ b/express/Utils.hpp @@ -29,7 +29,7 @@ struct BufferStorage { }; struct Expr::Inside { Inside(int outputSize); - Inside(Tensor* tensor); + Inside(Tensor* tensor, bool own = false); ~ Inside(); std::vector mOutputInfos; std::vector mOutputTensors; diff --git a/express/module/FixModule.cpp b/express/module/FixModule.cpp deleted file mode 100644 index c6a0cf30..00000000 --- a/express/module/FixModule.cpp +++ /dev/null @@ -1,52 +0,0 @@ -// -// FixModule.cpp -// MNN -// -// Created by MNN on 2019/12/16. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "FixModule.hpp" -#include -using namespace MNN::Express; -namespace MNN { -namespace Express { -FixModule::FixModule(std::vector output, std::vector parameters, - std::vector> inputs) { - for (auto p : parameters) { - addParameter(p); - } - mInputs = std::move(inputs); - mOutput = std::move(output); -} -void FixModule::onClearCache() { - for (auto v : mInputs) { - v.first.fix(VARP::INPUT); - } -} - -std::vector FixModule::onForward(const std::vector& inputs) { - MNN_ASSERT(inputs.size() == mInputs.size()); - for (int i = 0; i < inputs.size(); ++i) { - auto var = inputs[i]; - var = _Convert(var, mInputs[i].second); - Variable::replace(mInputs[i].first, var); - } - return mOutput; -} - -Module* FixModule::clone(CloneContext* ctx) const { - FixModule* module(new FixModule); - for (auto& it : mInputs) { - VARP v = ctx->getOrClone(it.first); - module->mInputs.push_back(std::make_pair(v, it.second)); - } - for (auto& it : mOutput) { - VARP v = ctx->getOrClone(it); - module->mOutput.push_back(v); - } - return this->cloneBaseTo(ctx, module); -} - -} // namespace Express -} // namespace MNN diff --git a/express/module/FixModule.hpp b/express/module/FixModule.hpp deleted file mode 100644 index 59ca5fac..00000000 --- a/express/module/FixModule.hpp +++ /dev/null @@ -1,33 +0,0 @@ -// -// FixModule.hpp -// MNN -// -// Created by MNN on 2019/12/16. 
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifndef FixModule_hpp -#define FixModule_hpp -#include -namespace MNN { -namespace Express { - -class FixModule : public Module { -public: - FixModule(std::vector output, std::vector parameters, - std::vector> inputs); - virtual ~FixModule() = default; - virtual std::vector onForward(const std::vector& inputs) override; - virtual void onClearCache() override; -private: - FixModule() = default; - - Module* clone(CloneContext* ctx) const override; - - std::vector> mInputs; - std::vector mOutput; -}; -} // namespace Express -} // namespace MNN - -#endif diff --git a/express/module/Module.cpp b/express/module/Module.cpp index d8df7242..345e3f93 100644 --- a/express/module/Module.cpp +++ b/express/module/Module.cpp @@ -8,7 +8,6 @@ #include #include -#include "FixModule.hpp" #include "PipelineModule.hpp" #include "core/FileLoader.hpp" @@ -124,15 +123,15 @@ Module* Module::load(const std::vector& inputs, const std::vector #include "Distributions.hpp" -#include "FixModule.hpp" +#include "PipelineModule.hpp" #include "WhileModule.hpp" #include "IfModule.hpp" #include "Initializer.hpp" @@ -364,11 +364,11 @@ Module* NN::ConvTranspose(const ConvOption& option, bool hasBias, if (nullptr != bias) { auto tempOutput = _Deconv(weight, bias, input, option.padMode, option.stride, option.dilate, group); tempOutput = _activate(tempOutput, option.fusedActivationFunction); - return new FixModule({tempOutput}, {weight, bias}, {{input, NC4HW4}}); + return PipelineModule::extract({input}, {tempOutput}, true); } auto tempOutput = _Deconv(weight, nullptr, input, option.padMode, option.stride, option.dilate, group); tempOutput = _activate(tempOutput, option.fusedActivationFunction); - return new FixModule({tempOutput}, {weight}, {{input, NC4HW4}}); + return PipelineModule::extract({input}, {tempOutput}, true); } Module* NN::Conv(const ConvOption& option, bool hasBias, std::shared_ptr weightInit, std::shared_ptr biasInit) { @@ -397,12 +397,12 @@ Module* NN::Linear(int l, int t, bool hasBias, std::shared_ptr weig auto input = _Input({l}, NCHW); auto output = _MatMul(input, weight, false, true); if (!hasBias) { - return new FixModule({output}, {weight}, {{input, NCHW}}); + return PipelineModule::extract({input}, {output}, true); } auto bias = biasInit->createConstVar({1, t}, NCHW); bias.fix(VARP::TRAINABLE); output = _Add(output, bias); - auto module = new FixModule({output}, {weight, bias}, {{input, NCHW}}); + auto module = PipelineModule::extract({input}, {output}, true); module->setType("Linear"); return module; } @@ -508,133 +508,10 @@ NN::ConvParameters NN::Utils::ExtractConvolution(EXPRP source) { return _default; } -static int _clamp(int c, int maxValue, int minValue) { - if (c > maxValue) { - return maxValue; - } - if (c < minValue) { - return minValue; - } - return c; -} -class ConvOctaveModule : public Module { -public: - ConvOctaveModule(const NN::ConvOption& option, VARP weight, VARP bias, int group, float inFactor, float outFactor) - : mOption(option) { - auto inputCountC4 = UP_DIV(option.channel[0], 4); - auto outputCountC4 = UP_DIV(option.channel[1], 4); - MNN_ASSERT(inputCountC4 > 1 && outputCountC4 > 1); - MNN_ASSERT(nullptr != bias); - auto iC0 = (int)((float)inputCountC4 * inFactor); - iC0 = _clamp(iC0, inputCountC4 - 1, 1); - - auto oC0 = (int)((float)outputCountC4 * outFactor); - oC0 = _clamp(oC0, outputCountC4 - 1, 1); - - iC0 = iC0 * 4; - auto iC1 = option.channel[0] - iC0; - oC0 = oC0 * 4; - auto oC1 = option.channel[1] - oC0; - 
mSplitInput = {iC0, iC1}; - - MNN_PRINT("Octave: %d, %d -> %d - %d, %d-%d\n", option.channel[0], option.channel[1], iC0, iC1, oC0, oC1); - auto splitBias = _Split(bias * _Scalar(0.5f), {oC0, oC1}, 0); - mLBias = splitBias[0]; - mHBias = splitBias[1]; - mLBias.fix(VARP::TRAINABLE); - mHBias.fix(VARP::TRAINABLE); - - auto splitWeight = _Split(weight, {oC0, oC1}, 0); - auto lw = _Split(splitWeight[0], {iC0, iC1}, 1); - auto hw = _Split(splitWeight[1], {iC0, iC1}, 1); - mLLW = lw[0]; - mLHW = lw[1]; - mHLW = hw[0]; - mHHW = hw[1]; - - mLLW.fix(VARP::TRAINABLE); - mLHW.fix(VARP::TRAINABLE); - mHLW.fix(VARP::TRAINABLE); - mHHW.fix(VARP::TRAINABLE); - mGroup = group; - addParameter(mLBias); - addParameter(mHBias); - addParameter(mLLW); - addParameter(mLHW); - addParameter(mHHW); - addParameter(mHLW); - setType("ConvOctave"); - } - virtual std::vector onForward(const std::vector& inputs) override { - auto input = _Convert(inputs[0], NC4HW4); - auto inputSplit = _Split(input, mSplitInput, 1); - auto XL = inputSplit[0]; - auto XH = inputSplit[1]; - if (input->getInfo()->dim[3] < 2) { - auto L2L = _Conv(mLLW, mLBias, XL, mOption.padMode, mOption.stride, mOption.dilate, mGroup); - auto L2H = _Conv(mHLW, mHBias, XL, mOption.padMode, mOption.stride, mOption.dilate, mGroup); - auto H2L = _Conv(mLHW, mLBias, XH, mOption.padMode, mOption.stride, mOption.dilate, mGroup); - auto H2H = _Conv(mHHW, mHBias, XH, mOption.padMode, mOption.stride, mOption.dilate, mGroup); - auto L = L2L + H2L; - auto H = H2H + L2H; - return {_Concat({L, H}, 1)}; - } - XL = _AvePool(XL, {2, 2}, {2, 2}); - auto info = XL->getInfo(); - auto L2L = _Conv(mLLW, mLBias, XL, mOption.padMode, mOption.stride, mOption.dilate, mGroup); - auto L2H = _Conv(mHLW, mHBias, XL, mOption.padMode, mOption.stride, mOption.dilate, mGroup); - auto H2L = - _Conv(mLHW, mLBias, _AvePool(XH, {2, 2}, {2, 2}), mOption.padMode, mOption.stride, mOption.dilate, mGroup); - auto H2H = _Conv(mHHW, mHBias, XH, mOption.padMode, mOption.stride, mOption.dilate, mGroup); - auto L = L2L + H2L; - auto H = H2H; - auto dstShape = H->getInfo()->dim; // NCHW - { H = H2H + _Interp({L2H}, 0.0f, 0.0f, dstShape[3], dstShape[2], 1, true); } - auto res = _Concat({_Interp({L}, 0.0f, 0.0f, dstShape[3], dstShape[2], 1, true), H}, 1); - info = res->getInfo(); - MNN_ASSERT(nullptr != info); - return {_activate(res, mOption.fusedActivationFunction)}; - } - -private: - ConvOctaveModule() = default; - - Module* clone(CloneContext* ctx) const override { - ConvOctaveModule* module(new ConvOctaveModule); - module->mOption = mOption; - module->mLLW = ctx->getOrClone(mLLW); - module->mLHW = ctx->getOrClone(mLHW); - module->mHLW = ctx->getOrClone(mHLW); - module->mHHW = ctx->getOrClone(mHHW); - module->mLBias = ctx->getOrClone(mLBias); - module->mHBias = ctx->getOrClone(mHBias); - module->mSplitInput = mSplitInput; - module->mGroup = mGroup; - return this->cloneBaseTo(ctx, module); - } - - NN::ConvOption mOption; - VARP mLLW; - VARP mLHW; - VARP mHLW; - VARP mHHW; - VARP mLBias; - VARP mHBias; - - std::vector mSplitInput; - int mGroup; -}; - Module* NN::Conv(const ConvParameters& parameter) { return new ConvModule(parameter); } -Module* NN::ConvOctave(const ConvParameters& parameters, - float inFactor, float outFactor) { - auto module = new ConvOctaveModule(parameters.option, parameters.weight, parameters.bias, parameters.group, inFactor, outFactor); - module->setName(parameters.name); - return module; -} Module* NN::Utils::ExtractNotRunableOp(Express::EXPRP expr, const std::map& subgraphs) { if 
(nullptr == expr->get()) { return nullptr; @@ -701,46 +578,90 @@ public: mActivation = mOption.fusedActivationFunction; } - mFeatureScaleStatMethod = featureScaleStatMethod; + if (featureScaleStatMethod == NN::PerChannel) { + MNN_PRINT("PerChannel quantization for feature is deprecated, use PerTensor method instead.\n"); + return; + } + + mFeatureScaleStatMethod = NN::PerTensor; mScaleUpdateMethod = scaleUpdateMethod; mBits = bits; - auto limit = (float)(1 << (bits - 1)) - 1.0f; - mLimitScale = _Scalar(1.0f / limit); - mClampValue = _Scalar(limit); + mLimit = (float)(1 << (bits - 1)) - 1.0f; + mLimitScale = _Scalar(1.0f / mLimit); + mWeightClampValue = _Scalar(mLimit); + mInputClampValue = _Scalar(mLimit); + mOutputClampValue = _Scalar(mLimit); - mInputScalePos = addParameter(mInputScale); - mOutputScalePos = addParameter(mOutputScale); + mInputMinPos = addParameter(mInputMin); + mInputMaxPos = addParameter(mInputMax); + mOutputMinPos = addParameter(mOutputMin); + mOutputMaxPos = addParameter(mOutputMax); setType("ConvBNReluFused"); } - std::pair fakeQuantFeature(VARP x, VARP useScale = nullptr) { + std::pair computeScaleAndZeroPoint(VARP min, VARP max, VARP clampVar) { + MNN_ASSERT((!(min == nullptr))); + MNN_ASSERT((!(max == nullptr))); + + min = _Minimum(_Scalar(0.0f), min); + max = _Maximum(_Scalar(0.0f), max); + + auto scale = (max - min) / (_Scalar(2.0f) * clampVar); + auto zeroPoint = _Round((_Scalar(0.0f) - min) / scale - clampVar); + + return std::make_pair(scale, zeroPoint); + } + + std::vector fakeQuantFeatureWithMinMax(VARP x, VARP useMin, VARP useMax, VARP clampVar) { auto originFormat = x->getInfo()->order; auto tempX = x; if (originFormat == NC4HW4) { tempX = _Convert(tempX, NCHW); } auto originX = tempX; - VARP scale = _Maximum(_ReduceMax(_Abs(tempX)), _Scalar(0.0001f)) * mLimitScale; - if (useScale == nullptr) { - tempX = _Round(tempX * _Reciprocal(scale)) * scale; + VARP min, max; + // always PerTensor + min = _ReduceMin(tempX); + max = _ReduceMax(tempX); + + VARP scale, zeroPoint; + VARP nudgeMin, nudgeMax; + + if (!(useMin == nullptr)) { + MNN_ASSERT(!(useMax == nullptr)); + auto scaleAndZeroPoint = computeScaleAndZeroPoint(useMin, useMax, clampVar); + scale = scaleAndZeroPoint.first; + zeroPoint = scaleAndZeroPoint.second; } else { - tempX = _Round(tempX * _Reciprocal(useScale)) * useScale; + auto scaleAndZeroPoint = computeScaleAndZeroPoint(min, max, clampVar); + scale = scaleAndZeroPoint.first; + zeroPoint = scaleAndZeroPoint.second; } + + float limit = clampVar->readMap()[0]; + nudgeMin = (_Scalar(-limit) - zeroPoint) * scale; + nudgeMax = (_Scalar(limit) - zeroPoint) * scale; + + nudgeMin = _Minimum(_Scalar(0.0f), nudgeMin); + nudgeMax = _Maximum(_Scalar(0.0f), nudgeMax); + + auto quantX = clamp(_Round(tempX / scale + zeroPoint), clampVar); + tempX = scale * (quantX - zeroPoint); // Break the grad by use cast tempX = _Cast(tempX); - // Move grad from tempX to originX tempX = _Convert(tempX + _ZeroGrad(originX), originFormat); - return std::make_pair(tempX, scale); + + return {tempX, nudgeMin, nudgeMax}; } - VARP clamp(VARP x) { - return _Maximum(_Minimum(x, mClampValue), _Negative(mClampValue)); + VARP clamp(VARP x, VARP clampVar) { + return _Maximum(_Minimum(x, clampVar), _Negative(clampVar)); } - VARP updateScale(VARP originValue, VARP newValue) const { + VARP updateParameter(VARP originValue, VARP newValue) const { if (nullptr == originValue) { return newValue; } @@ -761,20 +682,21 @@ public: if (getIsTraining()) { auto x = _Convert(inputs[0], NCHW); // simulate 
weight quant - auto weightScale = _Maximum(_ReduceMax(_Abs(mWeight), {1, 2, 3}, true), _Scalar(1E-6)) * mLimitScale; - auto weightTemp = _Round(mWeight * _Reciprocal(weightScale)) * weightScale; + auto weightScale = _Maximum(_ReduceMax(_Abs(mWeight), {1, 2, 3}, true), _Scalar(1E-6)) * _Reciprocal(mWeightClampValue); + auto weightTemp = clamp(_Round(mWeight * _Reciprocal(weightScale)), mWeightClampValue) * weightScale; weightTemp = weightTemp + _ZeroGrad(mWeight); // simulate input quant to get original input scale - auto inputPair = fakeQuantFeature(x); - mInputScale = updateScale(mInputScale, inputPair.second); - setParameter(mInputScale, mInputScalePos); + auto inputPair = fakeQuantFeatureWithMinMax(x, nullptr, nullptr, mInputClampValue); + mInputMin = updateParameter(mInputMin, inputPair[1]); + mInputMax = updateParameter(mInputMax, inputPair[2]); + setParameter(mInputMin, mInputMinPos); + setParameter(mInputMax, mInputMaxPos); // simulate output quant to get original output scale - res = _Conv(weightTemp, mBias, _Convert(inputPair.first, NC4HW4), mOption.padMode, mOption.stride, + res = _Conv(weightTemp, mBias, _Convert(inputPair[0], NC4HW4), mOption.padMode, mOption.stride, mOption.dilate, mGroup, mOption.pads); res->setName(name()); - auto conv = res; if (mBatchNorm) { res = mBatchNorm->forward(res); @@ -782,25 +704,29 @@ public: res = _activate(res, mActivation); - auto outputPair = fakeQuantFeature(res); - mOutputScale = updateScale(mOutputScale, outputPair.second); - setParameter(mOutputScale, mOutputScalePos); - res = outputPair.first; + auto outputPair = fakeQuantFeatureWithMinMax(res, nullptr, nullptr, mOutputClampValue); + mOutputMin = updateParameter(mOutputMin, outputPair[1]); + mOutputMax = updateParameter(mOutputMax, outputPair[2]); + setParameter(mOutputMin, mOutputMinPos); + setParameter(mOutputMax, mOutputMaxPos); + + res = outputPair[0]; } else { - if (nullptr == mInputScale) { + if (nullptr == mInputMin) { // Initial for test // simulate weight quant - auto weightScale = _Maximum(_ReduceMax(_Abs(mWeight), {1, 2, 3}, true), _Scalar(1E-6)) * mLimitScale; - weightScale.fix(VARP::CONSTANT); - auto weightTemp = _Round(mWeight * _Reciprocal(weightScale)) * weightScale; + auto weightScale = _Maximum(_ReduceMax(_Abs(mWeight), {1, 2, 3}, true), _Scalar(1E-6)) * _Reciprocal(mWeightClampValue); + auto weightTemp = clamp(_Round(mWeight * _Reciprocal(weightScale)), mWeightClampValue) * weightScale; auto x = _Convert(inputs[0], NCHW); - auto inputPair = fakeQuantFeature(x); - mInputScale = inputPair.second; - setParameter(mInputScale, mInputScalePos); - inputPair.first.fix(VARP::CONSTANT); - auto simuRes = _Conv(weightTemp, mBias, _Convert(inputPair.first, NC4HW4), mOption.padMode, mOption.stride, + auto inputPair = fakeQuantFeatureWithMinMax(x, nullptr, nullptr, mInputClampValue); + mInputMin = updateParameter(mInputMin, inputPair[1]); + mInputMax = updateParameter(mInputMax, inputPair[2]); + setParameter(mInputMin, mInputMinPos); + setParameter(mInputMax, mInputMaxPos); + + auto simuRes = _Conv(weightTemp, mBias, _Convert(inputPair[0], NC4HW4), mOption.padMode, mOption.stride, mOption.dilate, mGroup, mOption.pads); if (mBatchNorm) { simuRes = mBatchNorm->forward(simuRes); @@ -808,10 +734,12 @@ public: simuRes = _activate(simuRes, mActivation); Variable::prepareCompute({simuRes}); - auto outputPair = fakeQuantFeature(simuRes); - mOutputScale = outputPair.second; - setParameter(mOutputScale, mOutputScalePos); - outputPair.first.fix(VARP::CONSTANT); + + auto outputPair = 
fakeQuantFeatureWithMinMax(simuRes, nullptr, nullptr, mOutputClampValue); + mOutputMin = updateParameter(mOutputMin, outputPair[1]); + mOutputMax = updateParameter(mOutputMax, outputPair[2]); + setParameter(mOutputMin, mOutputMinPos); + setParameter(mOutputMax, mOutputMaxPos); } // fold bn to conv weights and bias @@ -833,21 +761,39 @@ public: alpha = _Reshape(alpha, {alpha->getInfo()->size, 1, 1, 1}); beta = _Reshape(beta, {beta->getInfo()->size, 1, 1, 1}); - alpha.fix(VARP::CONSTANT); - beta.fix(VARP::CONSTANT); fusedWeights = alpha * fusedWeights; fusedBias = alpha * fusedBias + beta; - fusedWeights.fix(VARP::CONSTANT); - fusedBias.fix(VARP::CONSTANT); } auto x = _Convert(inputs[0], NC4HW4); + + int8_t inputZeroPoint, outputZeroPoint; { - std::vector dims = {x->getInfo()->dim[1]}; - auto dimVar = _Const(dims.data(), {1}, NCHW, halide_type_of()); - VARP channelScale = _Reciprocal(_Fill(dimVar, mInputScale)); - x = _FloatToInt8(x, channelScale, -127, 127);// TODO add clamp + VARP channelScale, zeroPoint; + auto scaleAndZeroPoint = computeScaleAndZeroPoint(mInputMin, mInputMax, mInputClampValue); + mInputScale = scaleAndZeroPoint.first; + mInputZeroPoint = scaleAndZeroPoint.second; + + // always PerTensor + channelScale = _Reciprocal(mInputScale); + zeroPoint = _Cast(mInputZeroPoint); + + inputZeroPoint = zeroPoint->readMap()[0]; + + x = _FloatToInt8(x, channelScale, -int8_t(mInputClampValue->readMap()[0]), int8_t(mInputClampValue->readMap()[0]), inputZeroPoint); + } + { + VARP channelScale, zeroPoint; + auto scaleAndZeroPoint = computeScaleAndZeroPoint(mOutputMin, mOutputMax, mOutputClampValue); + mOutputScale = scaleAndZeroPoint.first; + mOutputZeroPoint = scaleAndZeroPoint.second; + + // always PerTensor + channelScale = mOutputScale; + zeroPoint = _Cast(mOutputZeroPoint); + + outputZeroPoint = zeroPoint->readMap()[0]; } std::vector weight; @@ -855,19 +801,18 @@ public: std::vector scale; { VARP weightScale, quanWeight, convScale; - if (mOption.depthwise) { - auto newWeight = fusedWeights * _Reshape(mInputScale, {-1, 1, 1, 1}); - weightScale = _Maximum(_ReduceMax(_Abs(newWeight), {1, 2, 3}, true), _Scalar(1E-6)) * mLimitScale; - quanWeight = _Cast(_Round(newWeight * _Reciprocal(weightScale))); - convScale = _Reshape(_Reciprocal(mOutputScale), {-1, 1, 1, 1}) * weightScale; - } else { - auto newWeight = fusedWeights * mInputScale; - weightScale = _Maximum(_ReduceMax(_Abs(newWeight), {1, 2, 3}, true), _Scalar(1E-6)) * mLimitScale; - quanWeight = _Cast(_Round(newWeight * _Reciprocal(weightScale))); - convScale = _Reshape(_Reciprocal(mOutputScale), {-1, 1, 1, 1}) * weightScale; - } - auto quanBias = _Cast(fusedBias * _Reciprocal(weightScale)); - Variable::prepareCompute({quanBias, quanWeight, convScale}); + auto newWeight = fusedWeights * mInputScale; + weightScale = _Maximum(_ReduceMax(_Abs(newWeight), {1, 2, 3}, true), _Scalar(1E-6)) * mLimitScale; + quanWeight = _Cast(_Round(newWeight * _Reciprocal(weightScale))); + convScale = _Reciprocal(mOutputScale) * weightScale; + Variable::prepareCompute({quanWeight, convScale}); + + auto remains = _ReduceSum(_Cast(mInputZeroPoint) * _Cast(quanWeight), {1, 2, 3}, true); + MNN_ASSERT((mOutputZeroPoint->getInfo()->dim.size() == 0) && (mOutputZeroPoint->getInfo()->size == 1)); // only support per-tensor, per-channel is removed. 
+ auto outputZeroPointFused = _Cast(_Cast(mOutputZeroPoint) * _Reciprocal(convScale)); + auto quanBias = _Cast(fusedBias * _Reciprocal(weightScale)) - remains + outputZeroPointFused; + Variable::prepareCompute({quanBias}); + { auto info = quanWeight->getInfo(); weight.resize(info->size); @@ -888,14 +833,13 @@ public: } bool relu = mActivation == NN::None ? false : true; res = _Conv(std::move(weight), std::move(bias), std::move(scale), _Convert(x, NC4HW4), mOption.channel, - mOption.kernelSize, mOption.padMode, mOption.stride, mOption.dilate, mGroup, mOption.pads, relu, 0, 0, -int8_t(mClampValue->readMap()[0]), int8_t(mClampValue->readMap()[0]), false); + mOption.kernelSize, mOption.padMode, mOption.stride, mOption.dilate, mGroup, mOption.pads, relu, + inputZeroPoint, outputZeroPoint, + -int8_t(mOutputClampValue->readMap()[0]), int8_t(mOutputClampValue->readMap()[0]), mAccumulateToInt16); res->setName(name()); - { - std::vector dims = {res->getInfo()->dim[1]}; - auto dimVar = _Const(dims.data(), {1}, NCHW, halide_type_of()); - VARP channelScale = _Fill(dimVar, mOutputScale); - res = _Int8ToFloat(res, channelScale); - } + + // always PerTensor + res = _Int8ToFloat(res, mOutputScale, outputZeroPoint); } return {res}; @@ -915,12 +859,23 @@ private: module->mBias = ctx->getOrClone(mBias); module->mActivation = mActivation; module->mBits = mBits; + module->mLimit = mLimit; module->mLimitScale = ctx->getOrClone(mLimitScale); - module->mInputScalePos = mInputScalePos; - module->mOutputScalePos = mOutputScalePos; + module->mWeightClampValue = ctx->getOrClone(mWeightClampValue); module->mInputScale = ctx->getOrClone(mInputScale); module->mOutputScale = ctx->getOrClone(mOutputScale); - module->mClampValue = ctx->getOrClone(mClampValue); + module->mInputMin = ctx->getOrClone(mInputMin); + module->mInputMax = ctx->getOrClone(mInputMax); + module->mOutputMin = ctx->getOrClone(mOutputMin); + module->mOutputMax = ctx->getOrClone(mOutputMax); + module->mInputZeroPoint = ctx->getOrClone(mInputZeroPoint); + module->mOutputZeroPoint = ctx->getOrClone(mOutputZeroPoint); + module->mInputMinPos = mInputMinPos; + module->mInputMaxPos = mInputMaxPos; + module->mOutputMinPos = mOutputMinPos; + module->mOutputMaxPos = mOutputMaxPos; + module->mInputClampValue = ctx->getOrClone(mInputClampValue); + module->mOutputClampValue = ctx->getOrClone(mOutputClampValue); module->mMomentum = mMomentum; module->mFeatureScaleStatMethod = mFeatureScaleStatMethod; module->mScaleUpdateMethod = mScaleUpdateMethod; @@ -939,15 +894,27 @@ private: NN::ActivationFunctionType mActivation = NN::ActivationFunctionType::None; std::shared_ptr mBatchNorm = nullptr; int mBits; + float mLimit; VARP mLimitScale; - int mInputScalePos = -1; - int mOutputScalePos = -1; + Express::VARP mWeightClampValue; VARP mInputScale = nullptr; VARP mOutputScale = nullptr; - VARP mClampValue; + VARP mInputMin = nullptr; + VARP mInputMax = nullptr; + VARP mOutputMin = nullptr; + VARP mOutputMax = nullptr; + VARP mInputZeroPoint = nullptr; + VARP mOutputZeroPoint = nullptr; + int mInputMinPos = -1; + int mInputMaxPos = -1; + int mOutputMinPos = -1; + int mOutputMaxPos = -1; + VARP mInputClampValue; + VARP mOutputClampValue; float mMomentum = 0.99f; NN::FeatureScaleStatMethod mFeatureScaleStatMethod; NN::ScaleUpdateMethod mScaleUpdateMethod; + bool mAccumulateToInt16 = false; }; Module* NN::ConvBNReluFused(std::vector > modules, @@ -967,4 +934,4 @@ Module* NN::ConvInt8(const ConvParameters& para, int bits, NN::FeatureScaleStatM } } // namespace Express -} // 
namespace MNN \ No newline at end of file +} // namespace MNN diff --git a/express/module/PipelineModule.cpp b/express/module/PipelineModule.cpp index ed1b9ad7..ac329eda 100644 --- a/express/module/PipelineModule.cpp +++ b/express/module/PipelineModule.cpp @@ -425,6 +425,7 @@ void PipelineModule::_createSubGraph(const MNN::Net* net, const Module::Config* std::unique_ptr _tempNet(new NetT); _tempNet->oplists = std::move(_tempInfo->nodes); _tempNet->tensorName = std::move(_tempInfo->tensors); + _tempNet->extraTensorDescribe = std::move(_tempInfo->extraTensorDescribe); flatbuffers::FlatBufferBuilder builder(1024); auto offset = Net::Pack(builder, _tempNet.get()); builder.Finish(offset); @@ -598,6 +599,13 @@ static Module* _createSubModule(const MNN::Net* net, const SubModuleInfo& info, for (int i=0; itensorName()->size(); ++i) { _tempNet->tensorName[i] = net->tensorName()->GetAsString(i)->str(); } + // Copy Tensor Describe for quant model + if (net->extraTensorDescribe()) { + _tempNet->extraTensorDescribe.resize(net->extraTensorDescribe()->size()); + for (int i=0; iextraTensorDescribe()->size(); ++i) { + _tempNet->extraTensorDescribe[i].reset(net->extraTensorDescribe()->Get(i)->UnPack()); + } + } // Create Input node std::vector inputNames; for (auto index : info.inputs) { @@ -727,6 +735,12 @@ Module* PipelineModule::load(const std::vector& inputs, const std:: // Make Stack, first: origin, second: new std::map stackMap; int stackIndex = 0; + for (auto index : inputIndexesVec) { + if (stackMap.find(index) == stackMap.end()) { + stackMap.insert(std::make_pair(index, stackIndex)); + stackIndex++; + } + } for (auto& m : subModulesInfo) { for (auto index : m.inputs) { if (stackMap.find(index) == stackMap.end()) { @@ -742,6 +756,7 @@ Module* PipelineModule::load(const std::vector& inputs, const std:: } } result->mStackSize = stackMap.size(); + MNN_ASSERT(result->mStackSize > 0); for (int i=0; i #include "Utils.hpp" #include "core/MNNMemoryUtils.h" -#include "core/Schedule.hpp" #include "core/Session.hpp" #include "core/TensorUtils.hpp" @@ -24,15 +23,60 @@ static std::shared_ptr preRearrangeWeights( // NOLINT const MNN::Net* net, std::map>& cache, Backend* backend) { std::unique_ptr net_table(net->UnPack()); std::map> exeCache; + bool isQuantModel = !net_table->extraTensorDescribe.empty(); + std::vector quantInfos; + std::vector> inputTensors; + if (isQuantModel) { + quantInfos.resize(net_table->tensorName.size(), nullptr); + for (auto& tensorDes : net_table->extraTensorDescribe) { + quantInfos[tensorDes->index] = tensorDes->quantInfo.get(); + } + } for (int i = 0; i < net->oplists()->size(); ++i) { auto op = net->oplists()->Get(i); auto op_table = net_table->oplists[i].get(); + if (op->inputIndexes() == nullptr || op->inputIndexes()->size() != 1) { + continue; + } switch (op->type()) { case MNN::OpType_DepthwiseConvInt8: case MNN::OpType_ConvInt8: case MNN::OpType_ConvolutionDepthwise: case MNN::OpType_Convolution: { - std::shared_ptr exe(backend->onCreate({}, {}, op)); + std::shared_ptr exe; + if (isQuantModel) { + int inputIdx = op->inputIndexes()->Get(0); + auto inputTensor = Tensor::create({1}, halide_type_of()); + inputTensors.emplace_back(inputTensor); + auto& inputQuantAttr = TensorUtils::getDescribe(inputTensor)->quantAttr; + if (quantInfos[inputIdx]) { + inputQuantAttr.reset(new QuantAttr); + inputQuantAttr->scale = quantInfos[inputIdx]->scale; + inputQuantAttr->min = quantInfos[inputIdx]->min; + inputQuantAttr->max = quantInfos[inputIdx]->max; + inputQuantAttr->zero = 
quantInfos[inputIdx]->zero; + } else { + inputQuantAttr.reset(); + } + int outputIdx = op->inputIndexes()->Get(0); + auto outputTensor = Tensor::create({1}, halide_type_of()); + inputTensors.emplace_back(outputTensor); + auto& outputQuantAttr = TensorUtils::getDescribe(outputTensor)->quantAttr; + if (quantInfos[outputIdx]) { + outputQuantAttr.reset(new QuantAttr); + outputQuantAttr->scale = quantInfos[outputIdx]->scale; + outputQuantAttr->min = quantInfos[outputIdx]->min; + outputQuantAttr->max = quantInfos[outputIdx]->max; + outputQuantAttr->zero = quantInfos[outputIdx]->zero; + } else { + outputQuantAttr.reset(); + } + if (inputQuantAttr && outputQuantAttr && op->main_as_Convolution2D()->quanParameter()) { + exe.reset(backend->onCreate({inputTensor}, {outputTensor}, op)); + } + } else { + exe.reset(backend->onCreate({}, {}, op)); + } if (nullptr == exe) { break; } @@ -70,9 +114,6 @@ static std::shared_ptr preRearrangeWeights( // NOLINT auto op = net->oplists()->Get(iter.first); cache.insert(std::make_pair(op, iter.second)); } - for (int i = 0; i < net->oplists()->size(); ++i) { - auto op = net->oplists()->Get(i); - } return net_storage; } @@ -129,18 +170,47 @@ StaticModule::StaticModule(const void* buffer, size_t length, const std::vector< if (mResource->mOutputFromTensor.empty()) { return; } - auto rt = Express::ExecutorScope::Current()->getRuntime(); + + RuntimeInfo rt; + if (moduleconfig.backend == nullptr) { + rt = Express::ExecutorScope::Current()->getRuntime(); + } else { + ScheduleConfig sche_config; + sche_config.type = moduleconfig.backend->type; + sche_config.backendConfig = moduleconfig.backend->config; + rt = Interpreter::createRuntime(std::vector({sche_config})); + } // TODO: Add Config - ScheduleConfig config; - config.numThread = 1; - config.type = rt.first.begin()->first; - config.saveTensors = outputs; - auto scheduleInfo = Schedule::schedule(GetNet(buffer), {config}); + mResource->mConfig.numThread = 1; + mResource->mConfig.type = rt.first.begin()->first; + mResource->mConfig.path.mode = ScheduleConfig::Path::Mode::Tensor; + mResource->mConfig.path.outputs = outputs; + mResource->mConfig.saveTensors = outputs; + mResource->mConfig.path.inputs = inputs; + auto scheduleInfo = Schedule::schedule(GetNet(buffer), {mResource->mConfig}); #ifdef MNN_EXPR_ENABLE_PROFILER Interpreter::SessionMode callBackMode = Interpreter::Session_Debug; #else Interpreter::SessionMode callBackMode = Interpreter::Session_Release; #endif + auto isUsedContent = [&scheduleInfo](const Tensor* t) { + const auto& infos = scheduleInfo.pipelineInfo[0].second; + for (auto info : infos) { + auto needInputs = SizeComputer::needInputContent(info.op, info.inputs.size()); + for (auto inputIdx : needInputs) { + if (inputIdx < info.inputs.size() && info.inputs[inputIdx] == t) { + return true; + } + } + } + return false; + }; + std::set useContentInputs; + for (const auto& iter : scheduleInfo.inputTensors) { + if (isUsedContent(iter.second)) { + useContentInputs.insert(iter.second); + } + } Interpreter::SessionMode inputMode = mResource->mShapeFix ? 
Interpreter::Session_Input_Inside : Interpreter::Session_Input_User; mSession.reset(new Session(std::move(scheduleInfo), callBackMode, inputMode, std::move(rt))); @@ -151,6 +221,9 @@ StaticModule::StaticModule(const void* buffer, size_t length, const std::vector< mInputTensors.resize(inputs.size()); for (int i = 0; i < inputs.size(); ++i) { mInputTensors[i] = mSession->getInput(inputs[i].c_str()); + if (useContentInputs.find(mInputTensors[i]) != useContentInputs.end()) { + mResource->mUseContentInputs.insert(i); + } } mOutputTensors.resize(mResource->mOutputFromTensor.size()); for (int i = 0; i < mResource->mOutputFromTensor.size(); ++i) { @@ -177,22 +250,18 @@ std::vector StaticModule::onForward(const std::vectorgetInfo(); - mInputTensors[i]->buffer().type = info->type; - auto des = TensorUtils::getDescribe(mInputTensors[i]); - if (info->order == Express::NCHW) { - des->dimensionFormat = MNN_DATA_FORMAT_NCHW; + auto exprInfo = inputs[i]->expr(); + auto inside = exprInfo.first->inside(); + auto inputTensor = inside->mOutputTensors[exprInfo.second]; + if (nullptr != inside->mCache) { + inputTensor = Executor::getOutput(inside->mCache.get(), inside->mCacheOffset); } - if (info->order == Express::NHWC) { - des->dimensionFormat = MNN_DATA_FORMAT_NHWC; - } - if (info->order == Express::NC4HW4) { - des->dimensionFormat = MNN_DATA_FORMAT_NC4HW4; - } - if (info->tensorArrayAttr != nullptr) { - des->tensorArrayAttr = info->tensorArrayAttr; - } - resizeTensor(mInputTensors[i], info->dim); + auto srcDes = TensorUtils::getDescribe(inputTensor); + auto des = TensorUtils::getDescribe(mInputTensors[i]); + des->dimensionFormat = srcDes->dimensionFormat; + des->tensorArrayAttr = srcDes->tensorArrayAttr; + mInputTensors[i]->buffer().type = inputTensor->buffer().type; + resizeTensor(mInputTensors[i], inputTensor->shape()); } if (!mResource->mShapeFix) { for (int i = 0; i < inputs.size(); ++i) { @@ -202,13 +271,14 @@ std::vector StaticModule::onForward(const std::vectorreadMap(); if (srcPtr != mInputTensors[i]->buffer().host) { mInputTensors[i]->buffer().host = srcPtr; - mSession->setNeedResize(); + mSession->setNeedMalloc(); + if (mResource->mUseContentInputs.find(i) != mResource->mUseContentInputs.end()) { + mSession->setNeedResize(); + } } } } - if (mSession->getNeedResize()) { - mSession->resize(); - } + mSession->resize(); if (mResource->mShapeFix) { for (int i = 0; i < inputs.size(); ++i) { if (nullptr == mInputTensors[i]) { @@ -247,34 +317,22 @@ std::vector StaticModule::onForward(const std::vectorquantAttr; + bool isQuant = (quantAttr && TensorUtils::DataTypeToHalideType(quantAttr->type) == currentTensor->getType()); // copy the data when reused as input tensor with data; - if (currentTensor->elementSize() > 0 && (mResource->mReusedTensors.find(mResource->mOutputFromTensor[i]) != mResource->mReusedTensors.end() || mResource->mCopyOutput)) { - std::shared_ptr tmpTensor(new Tensor(currentTensor, currentTensor->getDimensionType(), false)); + if (currentTensor->elementSize() > 0 && (mResource->mReusedTensors.find(mResource->mOutputFromTensor[i]) != mResource->mReusedTensors.end() || mResource->mCopyOutput || isQuant)) { + auto tmpTensor = new Tensor(currentTensor, currentTensor->getDimensionType(), false); tmpTensor->buffer().host = (uint8_t*)MNNMemoryAllocAlign(tmpTensor->size(), MNN_MEMORY_ALIGN_DEFAULT); auto des = TensorUtils::getDescribe(mOutputTensors[i]); if (nullptr != des->backend) { - currentTensor->copyToHostTensor(tmpTensor.get()); + currentTensor->copyToHostTensor(tmpTensor); } else { - 
MNNCPUCopyBuffer(currentTensor, tmpTensor.get()); - } - Express::Variable::Info info; - info.dim = tmpTensor->shape(); - info.type = tmpTensor->getType(); - auto format = des->dimensionFormat; - info.order = Express::NHWC; - if (format == MNN_DATA_FORMAT_NCHW) { - info.order = Express::NCHW; - } else if (format == MNN_DATA_FORMAT_NC4HW4) { - info.order = Express::NC4HW4; - } - // if this output tensor is TensorArray, copy attr - if (des->tensorArrayAttr != nullptr) { - info.tensorArrayAttr = des->tensorArrayAttr; + MNNCPUCopyBuffer(currentTensor, tmpTensor); } + TensorUtils::getDescribe(tmpTensor)->dimensionFormat = des->dimensionFormat; + TensorUtils::getDescribe(tmpTensor)->tensorArrayAttr = des->tensorArrayAttr; outputs[mResource->mOutputFromTensor[i]] = - Express::Variable::create(Express::Expr::create(std::move(info), tmpTensor->host(), - Express::VARP::CONSTANT, Expr::MemoryType::MOVE), - 0); + Express::Variable::create(Express::Expr::create(tmpTensor, true), 0); } else { outputs[mResource->mOutputFromTensor[i]] = Express::Variable::create(Express::Expr::create(mOutputTensors[i])); } @@ -293,11 +351,7 @@ Module* StaticModule::clone(CloneContext* ctx) const { return this->cloneBaseTo(ctx, module); } auto rt = Express::ExecutorScope::Current()->getRuntime(); - ScheduleConfig config; - config.numThread = 1; - config.type = rt.first.begin()->first; - config.saveTensors = mResource->mOutputs; - auto scheduleInfo = Schedule::schedule(GetNet(mResource->mNetStorage->buffer()), {config}); + auto scheduleInfo = Schedule::schedule(GetNet(mResource->mNetStorage->buffer()), {mResource->mConfig}); #ifdef MNN_EXPR_ENABLE_PROFILER Interpreter::SessionMode callBackMode = Interpreter::Session_Debug; #else diff --git a/express/module/StaticModule.hpp b/express/module/StaticModule.hpp index 24b45669..86d96c1c 100644 --- a/express/module/StaticModule.hpp +++ b/express/module/StaticModule.hpp @@ -11,6 +11,8 @@ #include #include +#include "core/Schedule.hpp" + namespace MNN { class Session; class Backend; @@ -40,8 +42,10 @@ private: std::vector> mOutputFromInput; // the outputs will be used as inputs std::set mReusedTensors; + std::set mUseContentInputs; std::shared_ptr mNetStorage; bool mCopyOutput = false; + ScheduleConfig mConfig; }; std::shared_ptr mSession; std::vector mInputTensors; diff --git a/include/MNN/ImageProcess.hpp b/include/MNN/ImageProcess.hpp index 7fbd5981..d5c6c748 100644 --- a/include/MNN/ImageProcess.hpp +++ b/include/MNN/ImageProcess.hpp @@ -133,11 +133,20 @@ public: } static Tensor* createImageTensor(halide_type_t type, int w, int h, int bpp, void* p = nullptr); + /** + * @brief set padding value when wrap=ZERO. + * @param value padding value. + * @return void. + */ + void setPadding(uint8_t value) { + mPaddingValue = value; + } private: ImageProcess(const Config& config); Matrix mTransform; Matrix mTransformInvert; Inside* mInside; + uint8_t mPaddingValue = 0; }; } // namespace CV } // namespace MNN diff --git a/include/MNN/Interpreter.hpp b/include/MNN/Interpreter.hpp index 49a19156..5a358a4c 100644 --- a/include/MNN/Interpreter.hpp +++ b/include/MNN/Interpreter.hpp @@ -47,7 +47,7 @@ struct ScheduleConfig { Op = 0, /** - * Tensor Mode (NOT supported yet) + * Tensor Mode * - inputs means the inputs tensors, can NOT be empty. * - outputs means the outputs tensors, can NOT be empty. * It will find the pipeline that compute outputs from inputs. 
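The hunk above flips ScheduleConfig's Tensor path mode from "NOT supported yet" to supported, and the StaticModule.cpp change earlier in this patch relies on it (path.mode = Tensor with path.inputs/path.outputs taken from the module's declared inputs and outputs). A minimal sketch of how a caller might request such a tensor-bounded sub-pipeline, assuming the usual Interpreter::createFromFile/createSession entry points; the model path and tensor names here are purely illustrative:

    // Sketch only: schedule the pipeline that computes "prob" from "data".
    #include <MNN/Interpreter.hpp>
    #include <memory>

    int main() {
        std::shared_ptr<MNN::Interpreter> net(
            MNN::Interpreter::createFromFile("model.mnn")); // illustrative path
        MNN::ScheduleConfig config;
        config.type      = MNN_FORWARD_CPU;
        config.numThread = 1;
        config.path.mode    = MNN::ScheduleConfig::Path::Mode::Tensor;
        config.path.inputs  = {"data"};   // start tensors (illustrative names)
        config.path.outputs = {"prob"};   // stop once these are produced
        config.saveTensors  = {"prob"};   // keep the output buffers valid after run
        auto session = net->createSession(config);
        // ... fill the "data" input tensor, then:
        net->runSession(session);
        return 0;
    }

This mirrors the configuration StaticModule now builds internally in this patch, where the module's inputs and outputs bound the scheduled pipeline instead of scheduling the whole graph.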
diff --git a/include/MNN/expr/Expr.hpp b/include/MNN/expr/Expr.hpp index 849a2053..094819af 100644 --- a/include/MNN/expr/Expr.hpp +++ b/include/MNN/expr/Expr.hpp @@ -22,7 +22,6 @@ struct OpT; struct Op; struct NetT; class Tensor; -struct TensorArrayAttr; namespace Express { class Variable; class Expr; @@ -110,7 +109,6 @@ public: halide_type_t type; int size; void syncSize(); - std::shared_ptr tensorArrayAttr; }; const std::string& name() const; void setName(const std::string& name); @@ -181,7 +179,7 @@ public: MOVE, REF }; - static EXPRP create(Tensor* tensor); + static EXPRP create(Tensor* tensor, bool own = false); static EXPRP create(Variable::Info&& info, const void* ptr, VARP::InputType type, MemoryType copy = COPY); static EXPRP create(const OpT* op, std::vector inputs, int outputSize = 1); @@ -240,7 +238,7 @@ private: static void _addLinkForInputs(EXPRP expr); Expr(int outputSize); - Expr(Tensor* tensor); + Expr(Tensor* tensor, bool own = false); friend class Variable; friend class VARP; diff --git a/include/MNN/expr/Module.hpp b/include/MNN/expr/Module.hpp index d008ecf4..adb4e535 100644 --- a/include/MNN/expr/Module.hpp +++ b/include/MNN/expr/Module.hpp @@ -13,6 +13,7 @@ #include #include +#include namespace MNN { namespace Express { @@ -47,6 +48,11 @@ public: void setParameter(Express::VARP parameter, int index); static Module* createEmpty(const std::vector& parameters); + struct BackendInfo { + MNNForwardType type = MNN_FORWARD_CPU; + BackendConfig* config = nullptr; + }; + struct Config { // Load module as dynamic, default static bool dynamic = false; @@ -57,6 +63,8 @@ public: // The weights will be rearranged in a general way, so the best implementation // may not be adopted if `rearrange` is enabled. bool rearrange = false; + + BackendInfo* backend = nullptr; }; static Module* load(const std::vector& inputs, const std::vector& outputs, const uint8_t* buffer, size_t length, const Config* config = nullptr); static Module* load(const std::vector& inputs, const std::vector& outputs, const char* fileName, const Config* config = nullptr); diff --git a/include/MNN/expr/NN.hpp b/include/MNN/expr/NN.hpp index d3364067..be4338e8 100644 --- a/include/MNN/expr/NN.hpp +++ b/include/MNN/expr/NN.hpp @@ -73,7 +73,6 @@ public: static Module* ConvInt8(const ConvParameters& parameters, int bits, FeatureScaleStatMethod featureMethod = PerChannel, ScaleUpdateMethod method = MovingAverage); - static Module* ConvOctave(const ConvParameters& parameters, float inFactor, float outFactor); static Module* Conv(const ConvParameters& parameters); static Module* ConvBNReluFused(std::vector > modules, NN::FeatureScaleStatMethod featureScaleStatMethod = PerTensor, diff --git a/include/MNN/expr/NeuralNetWorkOp.hpp b/include/MNN/expr/NeuralNetWorkOp.hpp index 53edb6b6..77d1dc1e 100644 --- a/include/MNN/expr/NeuralNetWorkOp.hpp +++ b/include/MNN/expr/NeuralNetWorkOp.hpp @@ -136,12 +136,16 @@ MNN_PUBLIC VARP _Conv(std::vector&& weight, std::vector&& bias, std int8_t inputZeroPoint, int8_t outputZeroPoint, int8_t minValue, int8_t maxValue, bool accumulateToInt16); MNN_PUBLIC VARP _CosineSimilarity(VARP input0, VARP input1, VARP inputDim); + +enum GridSamplePaddingMode {GRID_SAMPLE_PADDING_ZEROS, GRID_SAMPLE_PADDING_BORDER, GRID_SAMPLE_PADDING_REFLECTION}; +MNN_PUBLIC VARP _GridSample(VARP input, VARP grid, InterpolationMethod mode=BILINEAR, GridSamplePaddingMode paddingMode=GRID_SAMPLE_PADDING_ZEROS, bool alignCorners=false); MNN_PUBLIC VARP _FloatToInt8(VARP x, VARP scale, char minValue, char maxValue); 
MNN_PUBLIC VARP _FloatToInt8(VARP x, VARP scale, int8_t minValue, int8_t maxValue, int8_t zeroPoint); MNN_PUBLIC VARP _Int8ToFloat(VARP x, VARP scale); MNN_PUBLIC VARP _Int8ToFloat(VARP x, VARP scale, int8_t zeroPoint); MNN_PUBLIC VARP _Select(VARP select, VARP input0, VARP input1); +MNN_PUBLIC std::vector _TopKV2(VARP input0, VARP input1); } // namespace Express } // namespace MNN diff --git a/package_scripts/linux/build_tools.sh b/package_scripts/linux/build_tools.sh index 8ab5f4bc..e9d9a8c0 100644 --- a/package_scripts/linux/build_tools.sh +++ b/package_scripts/linux/build_tools.sh @@ -29,7 +29,7 @@ rm -rf build && mkdir build pushd build [ -f CMakeCache.txt ] && rm CMakeCache.txt -cmake $CMAKE_ARGS .. && make -j8 +cmake $CMAKE_ARGS .. && make -j24 cp *.out $TOOLS_PATH popd diff --git a/package_scripts/linux/build_whl.sh b/package_scripts/linux/build_whl.sh index 80829fd5..1157cb48 100755 --- a/package_scripts/linux/build_whl.sh +++ b/package_scripts/linux/build_whl.sh @@ -31,6 +31,7 @@ cmake $CMAKE_ARGS .. && make MNN MNNTrain MNNConvert -j24 popd pushd pymnn/pip_package +echo -e "__version__ = '$mnn_version'" > MNN/version.py rm -rf build && mkdir build rm -rf dist && mkdir dist rm -rf wheelhouse && mkdir wheelhouse @@ -46,5 +47,5 @@ for whl in dist/*.whl; do auditwheel repair "$whl" --plat manylinux2014_x86_64 -w wheelhouse done cp wheelhouse/* $PACKAGE_PATH - +rm MNN/version.py popd diff --git a/package_scripts/mac/build_whl.sh b/package_scripts/mac/build_whl.sh index 79b2db4e..a24fe93b 100755 --- a/package_scripts/mac/build_whl.sh +++ b/package_scripts/mac/build_whl.sh @@ -34,6 +34,7 @@ cmake $CMAKE_ARGS .. && make MNN MNNTrain MNNConvert -j8 popd pushd pymnn/pip_package +echo -e "__version__ = '$mnn_version'" > MNN/version.py rm -rf build && mkdir build rm -rf dist && mkdir dist for env in $python_versions; do @@ -41,5 +42,5 @@ for env in $python_versions; do python build_wheel.py --version $mnn_version done cp dist/* $PACKAGE_PATH - +rm MNN/version.py popd diff --git a/package_scripts/win/build_bridge.ps1 b/package_scripts/win/build_bridge.ps1 index 07f19505..17db0d61 100644 --- a/package_scripts/win/build_bridge.ps1 +++ b/package_scripts/win/build_bridge.ps1 @@ -10,6 +10,7 @@ # |--- Static Param( + [Parameter(Mandatory=$true)][String]$version, [Parameter(Mandatory=$true)][String]$pyc_env, [Parameter(Mandatory=$true)][String]$mnn_path, [Parameter(Mandatory=$true)][String]$path, @@ -62,6 +63,7 @@ popd pyenv global $pyc_env python -c "import compileall; compileall.compile_dir('./pymnn_pyc_tmp', force=True)" Remove-Item .\pymnn_pyc_tmp -Include *.py -Recurse +Set-Content -Path pymnn_pyc_tmp\version.py -Value "__version__ = '$version'" cp -r .\pymnn_pyc_tmp\* $PACKAGE_PATH\wrapper -Force rm -r -force pymnn_pyc_tmp diff --git a/package_scripts/win/build_lib.ps1 b/package_scripts/win/build_lib.ps1 index 14288cc6..95cf19da 100644 --- a/package_scripts/win/build_lib.ps1 +++ b/package_scripts/win/build_lib.ps1 @@ -34,7 +34,7 @@ if ($opencl) { $CMAKE_ARGS = "$CMAKE_ARGS -DMNN_OPENCL=ON" } -Remove-Item build -Recurse -ErrorAction Ignore +#Remove-Item build -Recurse -ErrorAction Ignore mkdir build pushd build diff --git a/package_scripts/win/build_whl.ps1 b/package_scripts/win/build_whl.ps1 index 0af13127..815bec1e 100644 --- a/package_scripts/win/build_whl.ps1 +++ b/package_scripts/win/build_whl.ps1 @@ -31,6 +31,7 @@ ninja MNN MNNTrain MNNConvert popd pushd pymnn/pip_package +Set-Content -Path MNN/version.py -Value "__version__ = '$version'" Remove-Item dist -Recurse -ErrorAction Ignore 
Remove-Item build -Recurse -ErrorAction Ignore mkdir dist @@ -41,4 +42,5 @@ Foreach ($env in $python_versions) { Invoke-Expression "python build_wheel.py $ARGS" } cp dist/* $PACKAGE_PATH +Remove-Item MNN/version.py -ErrorAction Ignore popd \ No newline at end of file diff --git a/project/android/build_32.sh b/project/android/build_32.sh index 635af366..1c516c21 100755 --- a/project/android/build_32.sh +++ b/project/android/build_32.sh @@ -8,6 +8,9 @@ cmake ../../../ \ -DANDROID_NATIVE_API_LEVEL=android-14 \ -DANDROID_TOOLCHAIN=clang \ -DMNN_USE_LOGCAT=false \ +-DMNN_USE_SSE=OFF \ +-DMNN_SUPPORT_BF16=OFF \ +-DMNN_BUILD_TEST=ON \ -DMNN_BUILD_FOR_ANDROID_COMMAND=true \ -DNATIVE_LIBRARY_OUTPUT=. -DNATIVE_INCLUDE_OUTPUT=. $1 $2 $3 diff --git a/project/android/build_32_arm82.sh b/project/android/build_32_arm82.sh new file mode 100755 index 00000000..b19183a8 --- /dev/null +++ b/project/android/build_32_arm82.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# Release compile work until ndk-r21e (clang 9.0.9svn), Debug compile work until ndk-r22 (clang 11.0.5) +# https://github.com/android/ndk/wiki/Changelog-r22#changes Issues 1303 +# https://github.com/android/ndk/wiki/Changelog-r21#r21e Issues 1248 +# export ANDROID_NDK=/path/to/ndk-r21e + +cmake ../../../ \ +-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ +-DCMAKE_BUILD_TYPE=Release \ +-DANDROID_ABI="armeabi-v7a" \ +-DANDROID_STL=c++_static \ +-DANDROID_NATIVE_API_LEVEL=android-18 \ +-DANDROID_TOOLCHAIN=clang \ +-DMNN_USE_LOGCAT=false \ +-DMNN_USE_SSE=OFF \ +-DMNN_SUPPORT_BF16=OFF \ +-DMNN_BUILD_TEST=ON \ +-DMNN_BUILD_FOR_ANDROID_COMMAND=true \ +-DNATIVE_LIBRARY_OUTPUT=. -DNATIVE_INCLUDE_OUTPUT=. $1 $2 $3 \ +-DMNN_ARM82=ON \ +-DMNN_BUILD_BENCHMARK=ON + +make -j8 diff --git a/project/android/build_64.sh b/project/android/build_64.sh index 6c053941..e717ee68 100755 --- a/project/android/build_64.sh +++ b/project/android/build_64.sh @@ -6,6 +6,9 @@ cmake ../../../ \ -DANDROID_STL=c++_static \ -DMNN_USE_LOGCAT=false \ -DMNN_BUILD_BENCHMARK=ON \ +-DMNN_USE_SSE=OFF \ +-DMNN_SUPPORT_BF16=OFF \ +-DMNN_BUILD_TEST=ON \ -DANDROID_NATIVE_API_LEVEL=android-21 \ -DMNN_BUILD_FOR_ANDROID_COMMAND=true \ -DNATIVE_LIBRARY_OUTPUT=. -DNATIVE_INCLUDE_OUTPUT=. 
$1 $2 $3 diff --git a/project/android/updateTest.sh b/project/android/updateTest.sh index 02568434..0ed9ff67 100755 --- a/project/android/updateTest.sh +++ b/project/android/updateTest.sh @@ -1,20 +1,22 @@ #!/bin/bash -make -j16 -adb push ./libMNN.so /data/local/tmp/MNN/libMNN.so -adb push ./libMNN_CL.so /data/local/tmp/MNN/libMNN_CL.so -adb push ./libMNN_Vulkan.so /data/local/tmp/MNN/libMNN_Vulkan.so -adb push ./libMNN_GL.so /data/local/tmp/MNN/libMNN_GL.so -adb push ./libMNN_Express.so /data/local/tmp/MNN/libMNN_Express.so -adb push ./MNNV2Basic.out /data/local/tmp/MNN/MNNV2Basic.out -adb shell "cd /data/local/tmp/MNN && rm -r output" -adb shell "cd /data/local/tmp/MNN && mkdir output" -adb push ./unitTest.out /data/local/tmp/MNN/unitTest.out -adb push ./testModel.out /data/local/tmp/MNN/testModel.out -adb push ./testModelWithDescrisbe.out /data/local/tmp/MNN/testModelWithDescrisbe.out -adb push ./backendTest.out /data/local/tmp/MNN/backendTest.out -adb push ./timeProfile.out /data/local/tmp/MNN/timeProfile.out +DIR=MNN -adb push ./train.out /data/local/tmp/MNN/train.out -adb push ./benchmark.out /data/local/tmp/MNN/benchmark.out -adb push ./benchmarkExprModels.out /data/local/tmp/MNN/benchmarkExprModels.out -adb push ./run_test.out /data/local/tmp/MNN/run_test.out +make -j16 +adb push ./libMNN.so /data/local/tmp/$DIR/libMNN.so +adb push ./libMNN_CL.so /data/local/tmp/$DIR/libMNN_CL.so +adb push ./libMNN_Vulkan.so /data/local/tmp/$DIR/libMNN_Vulkan.so +adb push ./libMNN_GL.so /data/local/tmp/$DIR/libMNN_GL.so +adb push ./libMNN_Express.so /data/local/tmp/$DIR/libMNN_Express.so +adb push ./MNNV2Basic.out /data/local/tmp/$DIR/MNNV2Basic.out +adb shell "cd /data/local/tmp/$DIR && rm -r output" +adb shell "cd /data/local/tmp/$DIR && mkdir output" +adb push ./unitTest.out /data/local/tmp/$DIR/unitTest.out +adb push ./testModel.out /data/local/tmp/$DIR/testModel.out +adb push ./testModelWithDescrisbe.out /data/local/tmp/$DIR/testModelWithDescrisbe.out +adb push ./backendTest.out /data/local/tmp/$DIR/backendTest.out +adb push ./timeProfile.out /data/local/tmp/$DIR/timeProfile.out + +adb push ./train.out /data/local/tmp/$DIR/train.out +adb push ./benchmark.out /data/local/tmp/$DIR/benchmark.out +adb push ./benchmarkExprModels.out /data/local/tmp/$DIR/benchmarkExprModels.out +adb push ./run_test.out /data/local/tmp/$DIR/run_test.out diff --git a/project/ios/MNN.xcodeproj/project.pbxproj b/project/ios/MNN.xcodeproj/project.pbxproj index 78db6d8d..f79c12d0 100644 --- a/project/ios/MNN.xcodeproj/project.pbxproj +++ b/project/ios/MNN.xcodeproj/project.pbxproj @@ -39,6 +39,16 @@ 4819FB3B24C69E680050BD09 /* GeometrySpatialProduct.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4819FB3624C69E680050BD09 /* GeometrySpatialProduct.cpp */; }; 4819FB3C24C69E680050BD09 /* GeometryBatchMatMul.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4819FB3724C69E680050BD09 /* GeometryBatchMatMul.cpp */; }; 4819FB3D24C69E680050BD09 /* GeometryCosineSimilarity.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4819FB3824C69E680050BD09 /* GeometryCosineSimilarity.cpp */; }; + 481C2DEC25FE2CD6001ED6DF /* Arm82WinogradOptFunc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 481C2DE225FE2CD5001ED6DF /* Arm82WinogradOptFunc.cpp */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; + 481C2DED25FE2CD6001ED6DF /* Arm82WinogradOptFunc.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 481C2DE325FE2CD5001ED6DF /* Arm82WinogradOptFunc.hpp */; }; + 481C2DEE25FE2CD6001ED6DF /* Arm82Functions.hpp in Headers */ = {isa 
= PBXBuildFile; fileRef = 481C2DE425FE2CD6001ED6DF /* Arm82Functions.hpp */; }; + 481C2DEF25FE2CD6001ED6DF /* Arm82Moments.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 481C2DE525FE2CD6001ED6DF /* Arm82Moments.hpp */; }; + 481C2DF025FE2CD6001ED6DF /* Arm82Functions.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 481C2DE625FE2CD6001ED6DF /* Arm82Functions.cpp */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; + 481C2DF125FE2CD6001ED6DF /* Arm82OptFunc.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 481C2DE725FE2CD6001ED6DF /* Arm82OptFunc.hpp */; }; + 481C2DF225FE2CD6001ED6DF /* Arm82InstanceNorm.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 481C2DE825FE2CD6001ED6DF /* Arm82InstanceNorm.hpp */; }; + 481C2DF325FE2CD6001ED6DF /* Arm82InstanceNorm.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 481C2DE925FE2CD6001ED6DF /* Arm82InstanceNorm.cpp */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; + 481C2DF425FE2CD6001ED6DF /* Arm82Moments.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 481C2DEA25FE2CD6001ED6DF /* Arm82Moments.cpp */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; + 481C2DF525FE2CD6001ED6DF /* Arm82OptFunc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 481C2DEB25FE2CD6001ED6DF /* Arm82OptFunc.cpp */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; 481FA848259C24A00047F01F /* CPUConvArm82Int8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 481FA846259C24A00047F01F /* CPUConvArm82Int8.cpp */; }; 481FA849259C24A00047F01F /* CPUConvArm82Int8.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 481FA847259C24A00047F01F /* CPUConvArm82Int8.hpp */; }; 481FA84F259C27B30047F01F /* GeometryTensorArray.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 481FA84E259C27B30047F01F /* GeometryTensorArray.cpp */; }; @@ -56,10 +66,15 @@ 4836CEE5257744120068F6CE /* ShapePlugin.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4836CEE4257744120068F6CE /* ShapePlugin.cpp */; }; 4837147225A599EC004DBDED /* Arm82Binary.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4837147025A599EC004DBDED /* Arm82Binary.cpp */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; 4837147325A599EC004DBDED /* Arm82Binary.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 4837147125A599EC004DBDED /* Arm82Binary.hpp */; }; + 4838EA7C2611BFE20027232C /* CPUGridSample.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 4838EA7A2611BFE20027232C /* CPUGridSample.hpp */; }; + 4838EA7D2611BFE20027232C /* CPUGridSample.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4838EA7B2611BFE20027232C /* CPUGridSample.cpp */; }; + 4838EA832611C00B0027232C /* MetalGridSample.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 4838EA802611C00B0027232C /* MetalGridSample.hpp */; }; + 4838EA842611C00B0027232C /* MetalGridSample.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4838EA812611C00B0027232C /* MetalGridSample.metal */; }; + 4838EA852611C00B0027232C /* MetalGridSample.mm in Sources */ = {isa = PBXBuildFile; fileRef = 4838EA822611C00B0027232C /* MetalGridSample.mm */; }; + 4838EA8B2611C1310027232C /* ShapeGridSample.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4838EA8A2611C1310027232C /* ShapeGridSample.cpp */; }; 48417FF024D13BF50056D9A7 /* GeometryThreshold.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48417FEC24D13BF50056D9A7 /* GeometryThreshold.cpp */; }; 48417FF124D13BF50056D9A7 /* GeometryELU.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48417FED24D13BF50056D9A7 /* GeometryELU.cpp */; }; 48417FF224D13BF50056D9A7 /* 
GeometrySelect.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48417FEE24D13BF50056D9A7 /* GeometrySelect.cpp */; }; - 48417FF324D13BF50056D9A7 /* GeometryTanH.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48417FEF24D13BF50056D9A7 /* GeometryTanH.cpp */; }; 48608B51250632EC00CB1D71 /* GeometryComputer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48608B4D250632EC00CB1D71 /* GeometryComputer.cpp */; }; 48608B52250632EC00CB1D71 /* GeometryComputerUtils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48608B4E250632EC00CB1D71 /* GeometryComputerUtils.cpp */; }; 48608B53250632EC00CB1D71 /* GeometryComputerUtils.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 48608B4F250632EC00CB1D71 /* GeometryComputerUtils.hpp */; }; @@ -97,7 +112,6 @@ 4882C8E2241A24D900DAC168 /* Pool3DTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8C1241A24D700DAC168 /* Pool3DTest.cpp */; }; 4882C8E3241A24D900DAC168 /* MultiConvolutionTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8C2241A24D700DAC168 /* MultiConvolutionTest.cpp */; }; 4882C8E4241A24D900DAC168 /* Dilation2DTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8C3241A24D700DAC168 /* Dilation2DTest.cpp */; }; - 4882C8E5241A24D900DAC168 /* SoftmaxGradTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8C4241A24D700DAC168 /* SoftmaxGradTest.cpp */; }; 4882C8E6241A24D900DAC168 /* ZerosLikeTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8C5241A24D700DAC168 /* ZerosLikeTest.cpp */; }; 4882C8E7241A24D900DAC168 /* ConvInt8Test.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8C6241A24D700DAC168 /* ConvInt8Test.cpp */; }; 4882C8E8241A24D900DAC168 /* UnravelIndexTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8C7241A24D700DAC168 /* UnravelIndexTest.cpp */; }; @@ -113,7 +127,6 @@ 4882C8F2241A24D900DAC168 /* StackTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8D1241A24D800DAC168 /* StackTest.cpp */; }; 4882C8F3241A24D900DAC168 /* MatrixBandPart.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8D2241A24D800DAC168 /* MatrixBandPart.cpp */; }; 4882C8F4241A24D900DAC168 /* MomentsTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8D3241A24D800DAC168 /* MomentsTest.cpp */; }; - 4882C8F5241A24D900DAC168 /* ReluGradTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8D4241A24D800DAC168 /* ReluGradTest.cpp */; }; 4882C8F6241A24D900DAC168 /* BroadcastToTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8D5241A24D800DAC168 /* BroadcastToTest.cpp */; }; 4882C8F7241A24D900DAC168 /* ArgMaxTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8D6241A24D900DAC168 /* ArgMaxTest.cpp */; }; 4882C8F8241A24D900DAC168 /* SetDiff1DTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4882C8D7241A24D900DAC168 /* SetDiff1DTest.cpp */; }; @@ -127,6 +140,17 @@ 488F1158247BB2A0008E85C6 /* Arm82Raster.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 488F1156247BB2A0008E85C6 /* Arm82Raster.cpp */; }; 488F1159247BB2A0008E85C6 /* Arm82Raster.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 488F1157247BB2A0008E85C6 /* Arm82Raster.hpp */; }; 489404DE24A2FC2C001E456C /* GeometryReverseSequence.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 489404DD24A2FC2B001E456C /* GeometryReverseSequence.cpp */; }; + 4896D36925FE2A3D00717702 /* Arm82Unary.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4896D36425FE2A3C00717702 /* Arm82Unary.cpp */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; + 4896D36A25FE2A3D00717702 /* Arm82Unary.hpp in 
Headers */ = {isa = PBXBuildFile; fileRef = 4896D36525FE2A3C00717702 /* Arm82Unary.hpp */; }; + 4896D36D25FE2A3D00717702 /* Arm82Vec.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 4896D36825FE2A3D00717702 /* Arm82Vec.hpp */; }; + 4896D37825FE2A6B00717702 /* MNNExpFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37025FE2A6A00717702 /* MNNExpFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; + 4896D37925FE2A6B00717702 /* MNNPackedMatMulFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37125FE2A6A00717702 /* MNNPackedMatMulFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; + 4896D37A25FE2A6B00717702 /* MNNPackedMatMulRemainFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37225FE2A6A00717702 /* MNNPackedMatMulRemainFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; + 4896D37B25FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37325FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; + 4896D37C25FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37425FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; + 4896D37D25FE2A6B00717702 /* MNNPackC8FP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37525FE2A6B00717702 /* MNNPackC8FP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; + 4896D37E25FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; + 4896D37F25FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; 489D7A672550FDC800AD896A /* MetalReLU6.metal in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A162550FDC800AD896A /* MetalReLU6.metal */; }; 489D7A682550FDC800AD896A /* MetalReduction.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A172550FDC800AD896A /* MetalReduction.hpp */; }; 489D7A6A2550FDC800AD896A /* MetalConvolutionGEMM.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A192550FDC800AD896A /* MetalConvolutionGEMM.hpp */; }; @@ -201,6 +225,7 @@ 489D7AB62550FDC900AD896A /* MetalReLU6.mm in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A652550FDC800AD896A /* MetalReLU6.mm */; }; 489D7AB72550FDC900AD896A /* MetalEltwise.metal in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A662550FDC800AD896A /* MetalEltwise.metal */; }; 489D7AC52550FF9F00AD896A /* ExecutorScope.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 489D7AC42550FF9F00AD896A /* ExecutorScope.cpp */; }; + 48A046FC25E4ABAC00CFA868 /* GeometryUnary.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48A046FB25E4ABAC00CFA868 /* GeometryUnary.cpp */; }; 48A8A61221D101A700C2B9A7 /* ImageProcess.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48A8A60B21D101A700C2B9A7 /* ImageProcess.cpp */; }; 48A8A61321D101A700C2B9A7 /* ImageSampler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48A8A60C21D101A700C2B9A7 /* ImageSampler.cpp */; }; 48A8A61421D101A700C2B9A7 /* ImageBlitter.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48A8A60D21D101A700C2B9A7 /* ImageBlitter.cpp */; }; @@ -212,7 +237,6 @@ 48C84B6C250F709E00EE7666 /* SizeComputer.hpp in Headers */ = 
{isa = PBXBuildFile; fileRef = 48C84B6A250F709E00EE7666 /* SizeComputer.hpp */; }; 48C84B6D250F709E00EE7666 /* SizeComputer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48C84B6B250F709E00EE7666 /* SizeComputer.cpp */; }; 48C84B80250F711700EE7666 /* Distributions.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48C84B6E250F711600EE7666 /* Distributions.cpp */; }; - 48C84B81250F711700EE7666 /* FixModule.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48C84B70250F711600EE7666 /* FixModule.cpp */; }; 48C84B82250F711700EE7666 /* PipelineModule.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48C84B71250F711600EE7666 /* PipelineModule.cpp */; }; 48C84B83250F711700EE7666 /* Module.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48C84B72250F711600EE7666 /* Module.cpp */; }; 48C84B84250F711700EE7666 /* WhileModule.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 48C84B73250F711600EE7666 /* WhileModule.hpp */; }; @@ -221,7 +245,6 @@ 48C84B87250F711700EE7666 /* WhileModule.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48C84B76250F711600EE7666 /* WhileModule.cpp */; }; 48C84B88250F711700EE7666 /* IfModule.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48C84B77250F711600EE7666 /* IfModule.cpp */; }; 48C84B89250F711700EE7666 /* StaticModule.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 48C84B78250F711600EE7666 /* StaticModule.hpp */; }; - 48C84B8A250F711700EE7666 /* FixModule.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 48C84B79250F711600EE7666 /* FixModule.hpp */; }; 48C84B8B250F711700EE7666 /* PipelineModule.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 48C84B7A250F711600EE7666 /* PipelineModule.hpp */; }; 48C84B8C250F711700EE7666 /* NN.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48C84B7B250F711600EE7666 /* NN.cpp */; }; 48C84B8D250F711700EE7666 /* Initializer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48C84B7C250F711600EE7666 /* Initializer.cpp */; }; @@ -263,6 +286,7 @@ 48FD034A246AA40300456AF5 /* GeometryConvert.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48FD0349246AA40300456AF5 /* GeometryConvert.cpp */; }; 48FD12BE2466A88D009E9102 /* GeometryImageOp.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48FD12BC2466A88C009E9102 /* GeometryImageOp.cpp */; }; 48FD12BF2466A88D009E9102 /* GeometryConv2DBackPropFilter.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 48FD12BD2466A88D009E9102 /* GeometryConv2DBackPropFilter.cpp */; }; + 4D759B2C25FF89EE0037B0B6 /* GeometryShape.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4D759B2B25FF89EE0037B0B6 /* GeometryShape.cpp */; }; 6A131E3F25823349002EC3D6 /* PluginShapeInference.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6A131E3D25823349002EC3D6 /* PluginShapeInference.cpp */; }; 6A131E4025823349002EC3D6 /* PluginKernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 6A131E3E25823349002EC3D6 /* PluginKernel.cpp */; }; 9200049921EDBDF600BCE892 /* TensorTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9200045D21EDBDF600BCE892 /* TensorTest.cpp */; }; @@ -306,8 +330,6 @@ 92A4E10321F07C76000B0919 /* AutoStorageTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92A4E10221F07C76000B0919 /* AutoStorageTest.cpp */; }; 92C674FF22549C9900011D33 /* ReLU6Test.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92C674FD22549C9900011D33 /* ReLU6Test.cpp */; }; 92D765BB222819EF00178BE5 /* BackendTest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92D765B8222819EF00178BE5 /* BackendTest.cpp */; }; - 92D765BD222819EF00178BE5 /* DirectedAcyclicGraphTest.cpp in Sources */ = {isa = 
PBXBuildFile; fileRef = 92D765BA222819EF00178BE5 /* DirectedAcyclicGraphTest.cpp */; }; - 92FF025523AA0B5A00AC97F6 /* CPUTanh.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF00D323AA0B4800AC97F6 /* CPUTanh.cpp */; }; 92FF025723AA0B5A00AC97F6 /* CPUQuanConvolutionDepthwise.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF00D523AA0B4800AC97F6 /* CPUQuanConvolutionDepthwise.cpp */; }; 92FF025923AA0B5A00AC97F6 /* CPUPoolInt8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF00D723AA0B4800AC97F6 /* CPUPoolInt8.cpp */; }; 92FF025C23AA0B5A00AC97F6 /* CPUGatherV2.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF00DA23AA0B4800AC97F6 /* CPUGatherV2.hpp */; }; @@ -335,9 +357,6 @@ 92FF027C23AA0B5A00AC97F6 /* CPUAsString.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF00FA23AA0B4A00AC97F6 /* CPUAsString.hpp */; }; 92FF027F23AA0B5A00AC97F6 /* CPUDeconvolutionDepthwise.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF00FD23AA0B4A00AC97F6 /* CPUDeconvolutionDepthwise.cpp */; }; 92FF028023AA0B5A00AC97F6 /* CPUFloatToInt8.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF00FE23AA0B4B00AC97F6 /* CPUFloatToInt8.hpp */; }; - 92FF028223AA0B5A00AC97F6 /* CPUSoftmaxGrad.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF010023AA0B4B00AC97F6 /* CPUSoftmaxGrad.hpp */; }; - 92FF028323AA0B5A00AC97F6 /* CPUSize.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF010123AA0B4B00AC97F6 /* CPUSize.hpp */; }; - 92FF028423AA0B5A00AC97F6 /* CPUPriorbox.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF010223AA0B4B00AC97F6 /* CPUPriorbox.cpp */; }; 92FF028623AA0B5A00AC97F6 /* CPUDeconvolution.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF010423AA0B4B00AC97F6 /* CPUDeconvolution.hpp */; }; 92FF028723AA0B5A00AC97F6 /* CPUFixedPoint.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF010523AA0B4B00AC97F6 /* CPUFixedPoint.hpp */; }; 92FF028823AA0B5A00AC97F6 /* CPUDequantize.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF010623AA0B4B00AC97F6 /* CPUDequantize.hpp */; }; @@ -353,9 +372,7 @@ 92FF029A23AA0B5A00AC97F6 /* CPUQuantizedMaxPool.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF011823AA0B4C00AC97F6 /* CPUQuantizedMaxPool.cpp */; }; 92FF029B23AA0B5A00AC97F6 /* CPUScale.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF011923AA0B4C00AC97F6 /* CPUScale.hpp */; }; 92FF029E23AA0B5A00AC97F6 /* CPUDeconvolutionDepthwise.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF011C23AA0B4D00AC97F6 /* CPUDeconvolutionDepthwise.hpp */; }; - 92FF029F23AA0B5A00AC97F6 /* CPUReluGrad.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF011D23AA0B4D00AC97F6 /* CPUReluGrad.hpp */; }; 92FF02A123AA0B5A00AC97F6 /* CPUDepthwiseConvInt8.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF011F23AA0B4D00AC97F6 /* CPUDepthwiseConvInt8.hpp */; }; - 92FF02A223AA0B5A00AC97F6 /* CPUSize.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF012023AA0B4D00AC97F6 /* CPUSize.cpp */; }; 92FF02A323AA0B5A00AC97F6 /* CPUQuantizedLogistic.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF012123AA0B4D00AC97F6 /* CPUQuantizedLogistic.cpp */; }; 92FF02A423AA0B5A00AC97F6 /* CPUBinary.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF012223AA0B4D00AC97F6 /* CPUBinary.cpp */; }; 92FF02A623AA0B5A00AC97F6 /* CPUQuantizedMaxPool.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF012423AA0B4D00AC97F6 /* CPUQuantizedMaxPool.hpp */; }; @@ -372,7 +389,6 @@ 92FF02B623AA0B5A00AC97F6 /* CPUUnary.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF013423AA0B4E00AC97F6 /* 
CPUUnary.cpp */; }; 92FF02B723AA0B5A00AC97F6 /* CPUQuantizedAdd.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF013523AA0B4E00AC97F6 /* CPUQuantizedAdd.hpp */; }; 92FF02B823AA0B5A00AC97F6 /* CPUWhere.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF013623AA0B4E00AC97F6 /* CPUWhere.cpp */; }; - 92FF02B923AA0B5A00AC97F6 /* CPUSoftmaxGrad.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF013723AA0B4E00AC97F6 /* CPUSoftmaxGrad.cpp */; }; 92FF02BB23AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Fast.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF013B23AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Fast.S */; }; 92FF02BC23AA0B5A00AC97F6 /* MNNScaleAddInt8.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF013C23AA0B4E00AC97F6 /* MNNScaleAddInt8.S */; }; 92FF02BD23AA0B5A00AC97F6 /* MNNMatrixProd.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF013D23AA0B4E00AC97F6 /* MNNMatrixProd.S */; }; @@ -381,7 +397,6 @@ 92FF02C023AA0B5A00AC97F6 /* MNNAddC4WithStride.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014023AA0B4E00AC97F6 /* MNNAddC4WithStride.S */; }; 92FF02C123AA0B5A00AC97F6 /* MNNQuanToDestUint8.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014123AA0B4E00AC97F6 /* MNNQuanToDestUint8.S */; }; 92FF02C223AA0B5A00AC97F6 /* MNNLoadU8AndSum.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014223AA0B4E00AC97F6 /* MNNLoadU8AndSum.S */; }; - 92FF02C423AA0B5A00AC97F6 /* MNNAddBiasRelu6.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014423AA0B4E00AC97F6 /* MNNAddBiasRelu6.S */; }; 92FF02C523AA0B5A00AC97F6 /* MNNStrassenMergeCFunction.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014523AA0B4E00AC97F6 /* MNNStrassenMergeCFunction.S */; }; 92FF02C623AA0B5A00AC97F6 /* MNNBlitC1ToFloatRGBA.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014623AA0B4E00AC97F6 /* MNNBlitC1ToFloatRGBA.S */; }; 92FF02C723AA0B5A00AC97F6 /* MNNCopyC4WithStride.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014723AA0B4E00AC97F6 /* MNNCopyC4WithStride.S */; }; @@ -389,7 +404,6 @@ 92FF02C923AA0B5A00AC97F6 /* MNNLineDepthWiseInt8AddBiasScaleUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014923AA0B4E00AC97F6 /* MNNLineDepthWiseInt8AddBiasScaleUnit.S */; }; 92FF02CA23AA0B5A00AC97F6 /* MNNUnPackC4.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014A23AA0B4E00AC97F6 /* MNNUnPackC4.S */; }; 92FF02CB23AA0B5A00AC97F6 /* MNNSamplerC1NearestOpt.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014B23AA0B4E00AC97F6 /* MNNSamplerC1NearestOpt.S */; }; - 92FF02CC23AA0B5A00AC97F6 /* MNNGemmFloatCommon_4.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014C23AA0B4E00AC97F6 /* MNNGemmFloatCommon_4.S */; }; 92FF02CD23AA0B5A00AC97F6 /* MNNNV21ToRGBUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014D23AA0B4E00AC97F6 /* MNNNV21ToRGBUnit.S */; }; 92FF02CE23AA0B5A00AC97F6 /* MNNPackC4.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014E23AA0B4E00AC97F6 /* MNNPackC4.S */; }; 92FF02CF23AA0B5A00AC97F6 /* MNNMinFloat.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF014F23AA0B4E00AC97F6 /* MNNMinFloat.S */; }; @@ -409,16 +423,13 @@ 92FF02E123AA0B5A00AC97F6 /* MNNPowC8.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016123AA0B4E00AC97F6 /* MNNPowC8.S */; }; 92FF02E223AA0B5A00AC97F6 /* MNNMatrixAdd.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016223AA0B4E00AC97F6 /* MNNMatrixAdd.S */; }; 92FF02E323AA0B5A00AC97F6 /* MNNExpC8.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016323AA0B4E00AC97F6 /* MNNExpC8.S */; }; - 92FF02E423AA0B5A00AC97F6 
/* MNNAddBiasRelu.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016423AA0B4E00AC97F6 /* MNNAddBiasRelu.S */; }; 92FF02E523AA0B5A00AC97F6 /* MNNConvDwF23SourceTransUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016523AA0B4E00AC97F6 /* MNNConvDwF23SourceTransUnit.S */; }; 92FF02E623AA0B5A00AC97F6 /* MNNWinogradMatrixProductLeft.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016623AA0B4E00AC97F6 /* MNNWinogradMatrixProductLeft.S */; }; 92FF02E723AA0B5A00AC97F6 /* MNNDeconvRunForUnitDepthWise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016723AA0B4E00AC97F6 /* MNNDeconvRunForUnitDepthWise.S */; }; 92FF02E823AA0B5A00AC97F6 /* MNNSamplerC1BilinearOpt.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016823AA0B4E00AC97F6 /* MNNSamplerC1BilinearOpt.S */; }; 92FF02EA23AA0B5A00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016A23AA0B4E00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S */; }; - 92FF02EB23AA0B5A00AC97F6 /* MNNGemmFloatOne_4.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016B23AA0B4E00AC97F6 /* MNNGemmFloatOne_4.S */; }; 92FF02EC23AA0B5A00AC97F6 /* MNNWinogradMatrixProductRight.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016C23AA0B4E00AC97F6 /* MNNWinogradMatrixProductRight.S */; }; 92FF02EE23AA0B5A00AC97F6 /* MNNReluWithSlopeChannel.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016E23AA0B4E00AC97F6 /* MNNReluWithSlopeChannel.S */; }; - 92FF02EF23AA0B5A00AC97F6 /* MNNAddBias.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF016F23AA0B4E00AC97F6 /* MNNAddBias.S */; }; 92FF02F123AA0B5A00AC97F6 /* MNNCoefLine.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017123AA0B4E00AC97F6 /* MNNCoefLine.S */; }; 92FF02F223AA0B5A00AC97F6 /* MNNBlitC3ToFloatRGBA.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */; }; 92FF02F423AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */; }; @@ -427,7 +438,6 @@ 92FF02F723AA0B5A00AC97F6 /* MNNConvDwF23MulTransUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017723AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */; }; 92FF02F823AA0B5A00AC97F6 /* MNNConvRunForLineDepthwise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017823AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */; }; 92FF02F923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017923AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */; }; - 92FF02FA23AA0B5A00AC97F6 /* MNNGemmFloatUnit_4.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017A23AA0B4E00AC97F6 /* MNNGemmFloatUnit_4.S */; }; 92FF02FC23AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Fast.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017D23AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Fast.S */; }; 92FF02FD23AA0B5A00AC97F6 /* MNNScaleAddInt8.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017E23AA0B4E00AC97F6 /* MNNScaleAddInt8.S */; }; 92FF02FE23AA0B5A00AC97F6 /* MNNMatrixProd.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017F23AA0B4E00AC97F6 /* MNNMatrixProd.S */; }; @@ -436,7 +446,6 @@ 92FF030123AA0B5A00AC97F6 /* MNNAddC4WithStride.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF018223AA0B4E00AC97F6 /* MNNAddC4WithStride.S */; }; 92FF030223AA0B5A00AC97F6 /* MNNQuanToDestUint8.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF018323AA0B4E00AC97F6 /* MNNQuanToDestUint8.S */; }; 
92FF030323AA0B5A00AC97F6 /* MNNLoadU8AndSum.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF018423AA0B4E00AC97F6 /* MNNLoadU8AndSum.S */; }; - 92FF030523AA0B5A00AC97F6 /* MNNAddBiasRelu6.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF018623AA0B4E00AC97F6 /* MNNAddBiasRelu6.S */; }; 92FF030623AA0B5A00AC97F6 /* MNNStrassenMergeCFunction.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF018723AA0B4E00AC97F6 /* MNNStrassenMergeCFunction.S */; }; 92FF030723AA0B5A00AC97F6 /* MNNBlitC1ToFloatRGBA.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF018823AA0B4E00AC97F6 /* MNNBlitC1ToFloatRGBA.S */; }; 92FF030823AA0B5A00AC97F6 /* MNNCopyC4WithStride.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF018923AA0B4E00AC97F6 /* MNNCopyC4WithStride.S */; }; @@ -444,7 +453,6 @@ 92FF030A23AA0B5A00AC97F6 /* MNNLineDepthWiseInt8AddBiasScaleUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF018B23AA0B4E00AC97F6 /* MNNLineDepthWiseInt8AddBiasScaleUnit.S */; }; 92FF030B23AA0B5A00AC97F6 /* MNNUnPackC4.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF018C23AA0B4E00AC97F6 /* MNNUnPackC4.S */; }; 92FF030C23AA0B5A00AC97F6 /* MNNSamplerC1NearestOpt.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF018D23AA0B4E00AC97F6 /* MNNSamplerC1NearestOpt.S */; }; - 92FF030D23AA0B5A00AC97F6 /* MNNGemmFloatCommon_4.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF018E23AA0B4E00AC97F6 /* MNNGemmFloatCommon_4.S */; }; 92FF030E23AA0B5A00AC97F6 /* MNNNV21ToRGBUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF018F23AA0B4E00AC97F6 /* MNNNV21ToRGBUnit.S */; }; 92FF030F23AA0B5A00AC97F6 /* MNNPackC4.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF019023AA0B4E00AC97F6 /* MNNPackC4.S */; }; 92FF031023AA0B5A00AC97F6 /* MNNMinFloat.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF019123AA0B4E00AC97F6 /* MNNMinFloat.S */; }; @@ -464,16 +472,13 @@ 92FF032123AA0B5A00AC97F6 /* MNNPowC8.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01A223AA0B4E00AC97F6 /* MNNPowC8.S */; }; 92FF032223AA0B5A00AC97F6 /* MNNMatrixAdd.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01A323AA0B4E00AC97F6 /* MNNMatrixAdd.S */; }; 92FF032323AA0B5A00AC97F6 /* MNNExpC8.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01A423AA0B4E00AC97F6 /* MNNExpC8.S */; }; - 92FF032423AA0B5A00AC97F6 /* MNNAddBiasRelu.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01A523AA0B4E00AC97F6 /* MNNAddBiasRelu.S */; }; 92FF032523AA0B5A00AC97F6 /* MNNConvDwF23SourceTransUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01A623AA0B4E00AC97F6 /* MNNConvDwF23SourceTransUnit.S */; }; 92FF032623AA0B5A00AC97F6 /* MNNWinogradMatrixProductLeft.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01A723AA0B4E00AC97F6 /* MNNWinogradMatrixProductLeft.S */; }; 92FF032723AA0B5A00AC97F6 /* MNNDeconvRunForUnitDepthWise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01A823AA0B4E00AC97F6 /* MNNDeconvRunForUnitDepthWise.S */; }; 92FF032823AA0B5A00AC97F6 /* MNNSamplerC1BilinearOpt.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01A923AA0B4E00AC97F6 /* MNNSamplerC1BilinearOpt.S */; }; 92FF032A23AA0B5A00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01AB23AA0B4E00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S */; }; - 92FF032B23AA0B5A00AC97F6 /* MNNGemmFloatOne_4.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01AC23AA0B4E00AC97F6 /* MNNGemmFloatOne_4.S */; }; 92FF032C23AA0B5A00AC97F6 /* MNNWinogradMatrixProductRight.S in Sources */ = {isa = PBXBuildFile; fileRef = 
92FF01AD23AA0B4E00AC97F6 /* MNNWinogradMatrixProductRight.S */; }; 92FF032E23AA0B5A00AC97F6 /* MNNReluWithSlopeChannel.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01AF23AA0B4E00AC97F6 /* MNNReluWithSlopeChannel.S */; }; - 92FF032F23AA0B5A00AC97F6 /* MNNAddBias.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B023AA0B4E00AC97F6 /* MNNAddBias.S */; }; 92FF033123AA0B5A00AC97F6 /* MNNCoefLine.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B223AA0B4E00AC97F6 /* MNNCoefLine.S */; }; 92FF033223AA0B5A00AC97F6 /* MNNBlitC3ToFloatRGBA.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */; }; 92FF033423AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */; }; @@ -482,17 +487,12 @@ 92FF033723AA0B5A00AC97F6 /* MNNConvDwF23MulTransUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B823AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */; }; 92FF033823AA0B5A00AC97F6 /* MNNConvRunForLineDepthwise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B923AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */; }; 92FF033923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01BA23AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */; }; - 92FF033A23AA0B5A00AC97F6 /* MNNGemmFloatUnit_4.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01BB23AA0B4E00AC97F6 /* MNNGemmFloatUnit_4.S */; }; 92FF033C23AA0B5A00AC97F6 /* MNNAsmGlobal.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01BD23AA0B4E00AC97F6 /* MNNAsmGlobal.h */; }; - 92FF033D23AA0B5A00AC97F6 /* CPUReluGrad.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01BE23AA0B4E00AC97F6 /* CPUReluGrad.cpp */; }; 92FF033F23AA0B5A00AC97F6 /* CPUArgMax.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01C023AA0B4E00AC97F6 /* CPUArgMax.hpp */; }; - 92FF034023AA0B5A00AC97F6 /* CPUShape.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01C123AA0B4E00AC97F6 /* CPUShape.cpp */; }; - 92FF034123AA0B5A00AC97F6 /* CPURank.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01C223AA0B4E00AC97F6 /* CPURank.hpp */; }; 92FF034223AA0B5A00AC97F6 /* CPUReduction.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01C323AA0B4F00AC97F6 /* CPUReduction.cpp */; }; 92FF034423AA0B5A00AC97F6 /* CPUGatherND.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01C523AA0B4F00AC97F6 /* CPUGatherND.cpp */; }; 92FF034523AA0B5A00AC97F6 /* CPUQuantizedAvgPool.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01C623AA0B4F00AC97F6 /* CPUQuantizedAvgPool.hpp */; }; 92FF034623AA0B5A00AC97F6 /* CPUGatherND.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01C723AA0B4F00AC97F6 /* CPUGatherND.hpp */; }; - 92FF034A23AA0B5A00AC97F6 /* CPUTanh.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01CB23AA0B4F00AC97F6 /* CPUTanh.hpp */; }; 92FF034C23AA0B5A00AC97F6 /* CPUSetDiff1D.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01CD23AA0B4F00AC97F6 /* CPUSetDiff1D.hpp */; }; 92FF034D23AA0B5A00AC97F6 /* CPUCast.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01CE23AA0B4F00AC97F6 /* CPUCast.cpp */; }; 92FF035023AA0B5A00AC97F6 /* CPUOneHot.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01D123AA0B4F00AC97F6 /* CPUOneHot.hpp */; }; @@ -504,7 +504,6 @@ 92FF035923AA0B5A00AC97F6 /* CPUAsString.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01DA23AA0B5000AC97F6 /* CPUAsString.cpp */; }; 92FF035A23AA0B5A00AC97F6 /* 
CPUDetectionPostProcess.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01DB23AA0B5000AC97F6 /* CPUDetectionPostProcess.hpp */; }; 92FF035B23AA0B5A00AC97F6 /* CPURelu.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01DC23AA0B5000AC97F6 /* CPURelu.hpp */; }; - 92FF035F23AA0B5A00AC97F6 /* CPUShape.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01E023AA0B5000AC97F6 /* CPUShape.hpp */; }; 92FF036323AA0B5A00AC97F6 /* CPUScale.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01E423AA0B5100AC97F6 /* CPUScale.cpp */; }; 92FF036423AA0B5A00AC97F6 /* CPUUnravelIndex.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01E523AA0B5100AC97F6 /* CPUUnravelIndex.cpp */; }; 92FF036523AA0B5A00AC97F6 /* CPUResize.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01E623AA0B5100AC97F6 /* CPUResize.hpp */; }; @@ -521,7 +520,6 @@ 92FF037723AA0B5A00AC97F6 /* CPUConvolutionDepthwise.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01F823AA0B5200AC97F6 /* CPUConvolutionDepthwise.cpp */; }; 92FF037823AA0B5A00AC97F6 /* CPUROIPooling.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01F923AA0B5200AC97F6 /* CPUROIPooling.hpp */; }; 92FF037923AA0B5A00AC97F6 /* CPUInstanceNorm.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF01FA23AA0B5200AC97F6 /* CPUInstanceNorm.hpp */; }; - 92FF037A23AA0B5A00AC97F6 /* CPUSigmoid.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01FB23AA0B5200AC97F6 /* CPUSigmoid.cpp */; }; 92FF037D23AA0B5A00AC97F6 /* CPURelu.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01FE23AA0B5200AC97F6 /* CPURelu.cpp */; }; 92FF037E23AA0B5A00AC97F6 /* CPUDetectionPostProcess.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01FF23AA0B5200AC97F6 /* CPUDetectionPostProcess.cpp */; }; 92FF038223AA0B5A00AC97F6 /* CPUSetDiff1D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF020323AA0B5300AC97F6 /* CPUSetDiff1D.cpp */; }; @@ -529,12 +527,10 @@ 92FF038623AA0B5A00AC97F6 /* CPULinSpace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF020723AA0B5300AC97F6 /* CPULinSpace.cpp */; }; 92FF038723AA0B5A00AC97F6 /* CPUTensorConvert.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF020823AA0B5300AC97F6 /* CPUTensorConvert.hpp */; }; 92FF038823AA0B5A00AC97F6 /* CPUQuantizedLogistic.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF020923AA0B5300AC97F6 /* CPUQuantizedLogistic.hpp */; }; - 92FF038923AA0B5A00AC97F6 /* CPUSigmoid.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF020A23AA0B5300AC97F6 /* CPUSigmoid.hpp */; }; 92FF038A23AA0B5A00AC97F6 /* CPURange.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF020B23AA0B5300AC97F6 /* CPURange.cpp */; }; 92FF038B23AA0B5A00AC97F6 /* CPUUnravelIndex.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF020C23AA0B5500AC97F6 /* CPUUnravelIndex.hpp */; }; 92FF038C23AA0B5A00AC97F6 /* CPUEltwise.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF020D23AA0B5500AC97F6 /* CPUEltwise.hpp */; }; 92FF038D23AA0B5A00AC97F6 /* CPUMatrixBandPart.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF020E23AA0B5500AC97F6 /* CPUMatrixBandPart.hpp */; }; - 92FF038F23AA0B5A00AC97F6 /* CPUPriorbox.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF021023AA0B5500AC97F6 /* CPUPriorbox.hpp */; }; 92FF039123AA0B5A00AC97F6 /* CPUBackend.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF021223AA0B5600AC97F6 /* CPUBackend.hpp */; }; 92FF039223AA0B5A00AC97F6 /* CPUDeconvolution.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF021323AA0B5600AC97F6 /* CPUDeconvolution.cpp */; }; 
92FF039323AA0B5A00AC97F6 /* CPUQuantizedAdd.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF021423AA0B5600AC97F6 /* CPUQuantizedAdd.cpp */; }; @@ -571,7 +567,6 @@ 92FF03BD23AA0B5A00AC97F6 /* Int8FunctionsOpt.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023F23AA0B5600AC97F6 /* Int8FunctionsOpt.h */; }; 92FF03BE23AA0B5A00AC97F6 /* DeconvolutionWithStride.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF024023AA0B5600AC97F6 /* DeconvolutionWithStride.cpp */; }; 92FF03BF23AA0B5A00AC97F6 /* ConvolutionTiledExecutor.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF024123AA0B5600AC97F6 /* ConvolutionTiledExecutor.cpp */; }; - 92FF03C123AA0B5A00AC97F6 /* CPURank.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF024323AA0B5600AC97F6 /* CPURank.cpp */; }; 92FF03C323AA0B5A00AC97F6 /* CPUEltwise.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF024523AA0B5700AC97F6 /* CPUEltwise.cpp */; }; 92FF03C423AA0B5A00AC97F6 /* CPUInterp.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF024623AA0B5700AC97F6 /* CPUInterp.cpp */; }; 92FF03C523AA0B5A00AC97F6 /* CPUReduceJoin.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF024723AA0B5700AC97F6 /* CPUReduceJoin.hpp */; }; @@ -661,7 +656,6 @@ 92FF04AD23AA0BFB00AC97F6 /* Execution.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF048C23AA0BFA00AC97F6 /* Execution.hpp */; }; 92FF04AE23AA0BFB00AC97F6 /* Backend.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF048D23AA0BFA00AC97F6 /* Backend.cpp */; }; 92FF04AF23AA0BFB00AC97F6 /* Macro.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF048E23AA0BFA00AC97F6 /* Macro.h */; }; - 92FF04B123AA0BFB00AC97F6 /* DirectedAcyclicGraph.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF049023AA0BFA00AC97F6 /* DirectedAcyclicGraph.hpp */; }; 92FF04B323AA0BFB00AC97F6 /* Schedule.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF049223AA0BFA00AC97F6 /* Schedule.cpp */; }; 92FF04B423AA0BFB00AC97F6 /* MNNMemoryUtils.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF049323AA0BFA00AC97F6 /* MNNMemoryUtils.h */; }; 92FF04B523AA0BFB00AC97F6 /* TensorUtils.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF049423AA0BFA00AC97F6 /* TensorUtils.hpp */; }; @@ -706,27 +700,16 @@ EBD4842A2485FF650083CE95 /* Arm82Interp.hpp in Headers */ = {isa = PBXBuildFile; fileRef = EBD484242485FF640083CE95 /* Arm82Interp.hpp */; }; EBD4842F2485FF660083CE95 /* Arm82Interp.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EBD484292485FF650083CE95 /* Arm82Interp.cpp */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; EBECA37B24643D110062C7A3 /* MNNGemmInt8AddBiasScale_ARMV82_Unit.S in Sources */ = {isa = PBXBuildFile; fileRef = EBECA37A24643D110062C7A3 /* MNNGemmInt8AddBiasScale_ARMV82_Unit.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+dotprod"; }; }; - EBECA38E24643D320062C7A3 /* Arm82ConvolutionDepthwise.hpp in Headers */ = {isa = PBXBuildFile; fileRef = EBECA37C24643D300062C7A3 /* Arm82ConvolutionDepthwise.hpp */; }; - EBECA38F24643D320062C7A3 /* Arm82Convolution.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EBECA37D24643D300062C7A3 /* Arm82Convolution.cpp */; }; - EBECA39024643D320062C7A3 /* Arm82ConvolutionDepthwise.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EBECA37E24643D300062C7A3 /* Arm82ConvolutionDepthwise.cpp */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; EBECA39224643D320062C7A3 /* Arm82Pooling.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EBECA38024643D300062C7A3 /* Arm82Pooling.cpp */; settings = {COMPILER_FLAGS = 
"-march=armv8.2-a+fp16"; }; }; EBECA39324643D320062C7A3 /* Arm82Pooling.hpp in Headers */ = {isa = PBXBuildFile; fileRef = EBECA38124643D310062C7A3 /* Arm82Pooling.hpp */; }; - EBECA39424643D320062C7A3 /* Arm82Convolution3x3.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EBECA38224643D310062C7A3 /* Arm82Convolution3x3.cpp */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; EBECA39524643D320062C7A3 /* Arm82Backend.hpp in Headers */ = {isa = PBXBuildFile; fileRef = EBECA38324643D310062C7A3 /* Arm82Backend.hpp */; }; EBECA39624643D320062C7A3 /* Arm82Eltwise.hpp in Headers */ = {isa = PBXBuildFile; fileRef = EBECA38424643D310062C7A3 /* Arm82Eltwise.hpp */; }; EBECA39724643D320062C7A3 /* Arm82Eltwise.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EBECA38524643D310062C7A3 /* Arm82Eltwise.cpp */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; EBECA39824643D320062C7A3 /* Arm82Relu.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EBECA38624643D310062C7A3 /* Arm82Relu.cpp */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; EBECA39924643D320062C7A3 /* Arm82Relu.hpp in Headers */ = {isa = PBXBuildFile; fileRef = EBECA38724643D310062C7A3 /* Arm82Relu.hpp */; }; - EBECA39A24643D320062C7A3 /* Arm82Convolution.hpp in Headers */ = {isa = PBXBuildFile; fileRef = EBECA38824643D310062C7A3 /* Arm82Convolution.hpp */; }; EBECA39B24643D320062C7A3 /* Arm82Backend.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EBECA38924643D310062C7A3 /* Arm82Backend.cpp */; }; - EBECA39C24643D320062C7A3 /* Arm82Convolution3x3.hpp in Headers */ = {isa = PBXBuildFile; fileRef = EBECA38A24643D310062C7A3 /* Arm82Convolution3x3.hpp */; }; - EBECA39D24643D320062C7A3 /* Arm82OptFunc.hpp in Headers */ = {isa = PBXBuildFile; fileRef = EBECA38B24643D310062C7A3 /* Arm82OptFunc.hpp */; }; - EBECA39F24643D320062C7A3 /* Arm82OptFunc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EBECA38D24643D320062C7A3 /* Arm82OptFunc.cpp */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; EBECA3A124643D4E0062C7A3 /* MNNAsmGlobal.h in Headers */ = {isa = PBXBuildFile; fileRef = EBECA3A024643D4E0062C7A3 /* MNNAsmGlobal.h */; }; - EBECA3A624643D5D0062C7A3 /* MNNLineDepthWiseFp16C8Unit.S in Sources */ = {isa = PBXBuildFile; fileRef = EBECA3A224643D5D0062C7A3 /* MNNLineDepthWiseFp16C8Unit.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; EBECA3A724643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S in Sources */ = {isa = PBXBuildFile; fileRef = EBECA3A324643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; - EBECA3A824643D5D0062C7A3 /* MNNGemmFP16C8_UNIT.S in Sources */ = {isa = PBXBuildFile; fileRef = EBECA3A424643D5D0062C7A3 /* MNNGemmFP16C8_UNIT.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; - EBECA3A924643D5D0062C7A3 /* MNNShuffleChannelC8.S in Sources */ = {isa = PBXBuildFile; fileRef = EBECA3A524643D5D0062C7A3 /* MNNShuffleChannelC8.S */; }; /* End PBXBuildFile section */ /* Begin PBXContainerItemProxy section */ @@ -773,6 +756,16 @@ 4819FB3624C69E680050BD09 /* GeometrySpatialProduct.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometrySpatialProduct.cpp; sourceTree = ""; }; 4819FB3724C69E680050BD09 /* GeometryBatchMatMul.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryBatchMatMul.cpp; sourceTree = ""; }; 4819FB3824C69E680050BD09 /* GeometryCosineSimilarity.cpp */ = {isa = PBXFileReference; fileEncoding 
= 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryCosineSimilarity.cpp; sourceTree = ""; }; + 481C2DE225FE2CD5001ED6DF /* Arm82WinogradOptFunc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82WinogradOptFunc.cpp; path = ../arm82/Arm82WinogradOptFunc.cpp; sourceTree = ""; }; + 481C2DE325FE2CD5001ED6DF /* Arm82WinogradOptFunc.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82WinogradOptFunc.hpp; path = ../arm82/Arm82WinogradOptFunc.hpp; sourceTree = ""; }; + 481C2DE425FE2CD6001ED6DF /* Arm82Functions.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82Functions.hpp; path = ../arm82/Arm82Functions.hpp; sourceTree = ""; }; + 481C2DE525FE2CD6001ED6DF /* Arm82Moments.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82Moments.hpp; path = ../arm82/Arm82Moments.hpp; sourceTree = ""; }; + 481C2DE625FE2CD6001ED6DF /* Arm82Functions.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Functions.cpp; path = ../arm82/Arm82Functions.cpp; sourceTree = ""; }; + 481C2DE725FE2CD6001ED6DF /* Arm82OptFunc.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82OptFunc.hpp; path = ../arm82/Arm82OptFunc.hpp; sourceTree = ""; }; + 481C2DE825FE2CD6001ED6DF /* Arm82InstanceNorm.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82InstanceNorm.hpp; path = ../arm82/Arm82InstanceNorm.hpp; sourceTree = ""; }; + 481C2DE925FE2CD6001ED6DF /* Arm82InstanceNorm.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82InstanceNorm.cpp; path = ../arm82/Arm82InstanceNorm.cpp; sourceTree = ""; }; + 481C2DEA25FE2CD6001ED6DF /* Arm82Moments.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Moments.cpp; path = ../arm82/Arm82Moments.cpp; sourceTree = ""; }; + 481C2DEB25FE2CD6001ED6DF /* Arm82OptFunc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82OptFunc.cpp; path = ../arm82/Arm82OptFunc.cpp; sourceTree = ""; }; 481FA846259C24A00047F01F /* CPUConvArm82Int8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUConvArm82Int8.cpp; sourceTree = ""; }; 481FA847259C24A00047F01F /* CPUConvArm82Int8.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUConvArm82Int8.hpp; sourceTree = ""; }; 481FA84E259C27B30047F01F /* GeometryTensorArray.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryTensorArray.cpp; sourceTree = ""; }; @@ -790,10 +783,15 @@ 4836CEE4257744120068F6CE /* ShapePlugin.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ShapePlugin.cpp; sourceTree = ""; }; 4837147025A599EC004DBDED /* Arm82Binary.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Binary.cpp; path = ../arm82/Arm82Binary.cpp; sourceTree = ""; }; 4837147125A599EC004DBDED /* Arm82Binary.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82Binary.hpp; path = ../arm82/Arm82Binary.hpp; sourceTree = ""; }; + 4838EA7A2611BFE20027232C /* CPUGridSample.hpp */ = 
{isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUGridSample.hpp; sourceTree = ""; }; + 4838EA7B2611BFE20027232C /* CPUGridSample.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUGridSample.cpp; sourceTree = ""; }; + 4838EA802611C00B0027232C /* MetalGridSample.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalGridSample.hpp; sourceTree = ""; }; + 4838EA812611C00B0027232C /* MetalGridSample.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = MetalGridSample.metal; sourceTree = ""; }; + 4838EA822611C00B0027232C /* MetalGridSample.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalGridSample.mm; sourceTree = ""; }; + 4838EA8A2611C1310027232C /* ShapeGridSample.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ShapeGridSample.cpp; sourceTree = ""; }; 48417FEC24D13BF50056D9A7 /* GeometryThreshold.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryThreshold.cpp; sourceTree = ""; }; 48417FED24D13BF50056D9A7 /* GeometryELU.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryELU.cpp; sourceTree = ""; }; 48417FEE24D13BF50056D9A7 /* GeometrySelect.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometrySelect.cpp; sourceTree = ""; }; - 48417FEF24D13BF50056D9A7 /* GeometryTanH.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryTanH.cpp; sourceTree = ""; }; 48608B4D250632EC00CB1D71 /* GeometryComputer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryComputer.cpp; sourceTree = ""; }; 48608B4E250632EC00CB1D71 /* GeometryComputerUtils.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryComputerUtils.cpp; sourceTree = ""; }; 48608B4F250632EC00CB1D71 /* GeometryComputerUtils.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = GeometryComputerUtils.hpp; sourceTree = ""; }; @@ -831,7 +829,6 @@ 4882C8C1241A24D700DAC168 /* Pool3DTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Pool3DTest.cpp; sourceTree = ""; }; 4882C8C2241A24D700DAC168 /* MultiConvolutionTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = MultiConvolutionTest.cpp; sourceTree = ""; }; 4882C8C3241A24D700DAC168 /* Dilation2DTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Dilation2DTest.cpp; sourceTree = ""; }; - 4882C8C4241A24D700DAC168 /* SoftmaxGradTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = SoftmaxGradTest.cpp; sourceTree = ""; }; 4882C8C5241A24D700DAC168 /* ZerosLikeTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ZerosLikeTest.cpp; sourceTree = ""; }; 4882C8C6241A24D700DAC168 /* ConvInt8Test.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvInt8Test.cpp; sourceTree = ""; }; 4882C8C7241A24D700DAC168 /* UnravelIndexTest.cpp */ = {isa = PBXFileReference; 
fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = UnravelIndexTest.cpp; sourceTree = ""; }; @@ -847,7 +844,6 @@ 4882C8D1241A24D800DAC168 /* StackTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = StackTest.cpp; sourceTree = ""; }; 4882C8D2241A24D800DAC168 /* MatrixBandPart.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = MatrixBandPart.cpp; sourceTree = ""; }; 4882C8D3241A24D800DAC168 /* MomentsTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = MomentsTest.cpp; sourceTree = ""; }; - 4882C8D4241A24D800DAC168 /* ReluGradTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ReluGradTest.cpp; sourceTree = ""; }; 4882C8D5241A24D800DAC168 /* BroadcastToTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = BroadcastToTest.cpp; sourceTree = ""; }; 4882C8D6241A24D900DAC168 /* ArgMaxTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ArgMaxTest.cpp; sourceTree = ""; }; 4882C8D7241A24D900DAC168 /* SetDiff1DTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = SetDiff1DTest.cpp; sourceTree = ""; }; @@ -861,6 +857,17 @@ 488F1156247BB2A0008E85C6 /* Arm82Raster.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Raster.cpp; path = ../arm82/Arm82Raster.cpp; sourceTree = ""; }; 488F1157247BB2A0008E85C6 /* Arm82Raster.hpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.h; name = Arm82Raster.hpp; path = ../arm82/Arm82Raster.hpp; sourceTree = ""; }; 489404DD24A2FC2B001E456C /* GeometryReverseSequence.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryReverseSequence.cpp; sourceTree = ""; }; + 4896D36425FE2A3C00717702 /* Arm82Unary.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Unary.cpp; path = ../arm82/Arm82Unary.cpp; sourceTree = ""; }; + 4896D36525FE2A3C00717702 /* Arm82Unary.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82Unary.hpp; path = ../arm82/Arm82Unary.hpp; sourceTree = ""; }; + 4896D36825FE2A3D00717702 /* Arm82Vec.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82Vec.hpp; path = ../arm82/Arm82Vec.hpp; sourceTree = ""; }; + 4896D37025FE2A6A00717702 /* MNNExpFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNExpFP16.S; path = ../../../arm82/asm/arm64/MNNExpFP16.S; sourceTree = ""; }; + 4896D37125FE2A6A00717702 /* MNNPackedMatMulFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNPackedMatMulFP16.S; path = ../../../arm82/asm/arm64/MNNPackedMatMulFP16.S; sourceTree = ""; }; + 4896D37225FE2A6A00717702 /* MNNPackedMatMulRemainFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNPackedMatMulRemainFP16.S; path = ../../../arm82/asm/arm64/MNNPackedMatMulRemainFP16.S; sourceTree = ""; }; + 4896D37325FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNConvDwF23MulTransUnitFP16.S; path = ../../../arm82/asm/arm64/MNNConvDwF23MulTransUnitFP16.S; 
sourceTree = ""; }; + 4896D37425FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNConvDwF23SourceTransUnitFP16.S; path = ../../../arm82/asm/arm64/MNNConvDwF23SourceTransUnitFP16.S; sourceTree = ""; }; + 4896D37525FE2A6B00717702 /* MNNPackC8FP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNPackC8FP16.S; path = ../../../arm82/asm/arm64/MNNPackC8FP16.S; sourceTree = ""; }; + 4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = Arm82MNNPackForMatMul_A.S; path = ../../../arm82/asm/arm64/Arm82MNNPackForMatMul_A.S; sourceTree = ""; }; + 4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNConvRunForLineDepthwiseFP16.S; path = ../../../arm82/asm/arm64/MNNConvRunForLineDepthwiseFP16.S; sourceTree = ""; }; 489D7A162550FDC800AD896A /* MetalReLU6.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = MetalReLU6.metal; sourceTree = ""; }; 489D7A172550FDC800AD896A /* MetalReduction.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalReduction.hpp; sourceTree = ""; }; 489D7A192550FDC800AD896A /* MetalConvolutionGEMM.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalConvolutionGEMM.hpp; sourceTree = ""; }; @@ -935,6 +942,7 @@ 489D7A652550FDC800AD896A /* MetalReLU6.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalReLU6.mm; sourceTree = ""; }; 489D7A662550FDC800AD896A /* MetalEltwise.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = MetalEltwise.metal; sourceTree = ""; }; 489D7AC42550FF9F00AD896A /* ExecutorScope.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ExecutorScope.cpp; sourceTree = ""; }; + 48A046FB25E4ABAC00CFA868 /* GeometryUnary.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryUnary.cpp; sourceTree = ""; }; 48A8A60B21D101A700C2B9A7 /* ImageProcess.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ImageProcess.cpp; sourceTree = ""; }; 48A8A60C21D101A700C2B9A7 /* ImageSampler.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ImageSampler.cpp; sourceTree = ""; }; 48A8A60D21D101A700C2B9A7 /* ImageBlitter.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ImageBlitter.cpp; sourceTree = ""; }; @@ -946,7 +954,6 @@ 48C84B6A250F709E00EE7666 /* SizeComputer.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = SizeComputer.hpp; sourceTree = ""; }; 48C84B6B250F709E00EE7666 /* SizeComputer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = SizeComputer.cpp; sourceTree = ""; }; 48C84B6E250F711600EE7666 /* Distributions.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Distributions.cpp; sourceTree = ""; }; - 48C84B70250F711600EE7666 /* FixModule.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; 
path = FixModule.cpp; sourceTree = ""; }; 48C84B71250F711600EE7666 /* PipelineModule.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PipelineModule.cpp; sourceTree = ""; }; 48C84B72250F711600EE7666 /* Module.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Module.cpp; sourceTree = ""; }; 48C84B73250F711600EE7666 /* WhileModule.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = WhileModule.hpp; sourceTree = ""; }; @@ -955,7 +962,6 @@ 48C84B76250F711600EE7666 /* WhileModule.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = WhileModule.cpp; sourceTree = ""; }; 48C84B77250F711600EE7666 /* IfModule.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = IfModule.cpp; sourceTree = ""; }; 48C84B78250F711600EE7666 /* StaticModule.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = StaticModule.hpp; sourceTree = ""; }; - 48C84B79250F711600EE7666 /* FixModule.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = FixModule.hpp; sourceTree = ""; }; 48C84B7A250F711600EE7666 /* PipelineModule.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = PipelineModule.hpp; sourceTree = ""; }; 48C84B7B250F711600EE7666 /* NN.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = NN.cpp; sourceTree = ""; }; 48C84B7C250F711600EE7666 /* Initializer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Initializer.cpp; sourceTree = ""; }; @@ -997,6 +1003,7 @@ 48FD0349246AA40300456AF5 /* GeometryConvert.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryConvert.cpp; sourceTree = ""; }; 48FD12BC2466A88C009E9102 /* GeometryImageOp.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryImageOp.cpp; sourceTree = ""; }; 48FD12BD2466A88D009E9102 /* GeometryConv2DBackPropFilter.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryConv2DBackPropFilter.cpp; sourceTree = ""; }; + 4D759B2B25FF89EE0037B0B6 /* GeometryShape.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryShape.cpp; sourceTree = ""; }; 6A131E3D25823349002EC3D6 /* PluginShapeInference.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PluginShapeInference.cpp; sourceTree = ""; }; 6A131E3E25823349002EC3D6 /* PluginKernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PluginKernel.cpp; sourceTree = ""; }; 9200045321EDBCF700BCE892 /* MNNTestSuite.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = MNNTestSuite.h; path = ../../../test/MNNTestSuite.h; sourceTree = ""; }; @@ -1047,8 +1054,6 @@ 92A4E10221F07C76000B0919 /* AutoStorageTest.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = AutoStorageTest.cpp; sourceTree = ""; }; 92C674FD22549C9900011D33 /* ReLU6Test.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = ReLU6Test.cpp; sourceTree = ""; }; 92D765B8222819EF00178BE5 /* BackendTest.cpp */ = {isa = PBXFileReference; 
fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = BackendTest.cpp; sourceTree = ""; }; - 92D765BA222819EF00178BE5 /* DirectedAcyclicGraphTest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = DirectedAcyclicGraphTest.cpp; sourceTree = ""; }; - 92FF00D323AA0B4800AC97F6 /* CPUTanh.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUTanh.cpp; sourceTree = ""; }; 92FF00D523AA0B4800AC97F6 /* CPUQuanConvolutionDepthwise.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUQuanConvolutionDepthwise.cpp; sourceTree = ""; }; 92FF00D723AA0B4800AC97F6 /* CPUPoolInt8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUPoolInt8.cpp; sourceTree = ""; }; 92FF00DA23AA0B4800AC97F6 /* CPUGatherV2.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUGatherV2.hpp; sourceTree = ""; }; @@ -1076,9 +1081,6 @@ 92FF00FA23AA0B4A00AC97F6 /* CPUAsString.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUAsString.hpp; sourceTree = ""; }; 92FF00FD23AA0B4A00AC97F6 /* CPUDeconvolutionDepthwise.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUDeconvolutionDepthwise.cpp; sourceTree = ""; }; 92FF00FE23AA0B4B00AC97F6 /* CPUFloatToInt8.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUFloatToInt8.hpp; sourceTree = ""; }; - 92FF010023AA0B4B00AC97F6 /* CPUSoftmaxGrad.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUSoftmaxGrad.hpp; sourceTree = ""; }; - 92FF010123AA0B4B00AC97F6 /* CPUSize.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUSize.hpp; sourceTree = ""; }; - 92FF010223AA0B4B00AC97F6 /* CPUPriorbox.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUPriorbox.cpp; sourceTree = ""; }; 92FF010423AA0B4B00AC97F6 /* CPUDeconvolution.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUDeconvolution.hpp; sourceTree = ""; }; 92FF010523AA0B4B00AC97F6 /* CPUFixedPoint.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUFixedPoint.hpp; sourceTree = ""; }; 92FF010623AA0B4B00AC97F6 /* CPUDequantize.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUDequantize.hpp; sourceTree = ""; }; @@ -1094,9 +1096,7 @@ 92FF011823AA0B4C00AC97F6 /* CPUQuantizedMaxPool.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUQuantizedMaxPool.cpp; sourceTree = ""; }; 92FF011923AA0B4C00AC97F6 /* CPUScale.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUScale.hpp; sourceTree = ""; }; 92FF011C23AA0B4D00AC97F6 /* CPUDeconvolutionDepthwise.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUDeconvolutionDepthwise.hpp; sourceTree = ""; }; - 92FF011D23AA0B4D00AC97F6 /* CPUReluGrad.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUReluGrad.hpp; sourceTree = ""; }; 92FF011F23AA0B4D00AC97F6 /* CPUDepthwiseConvInt8.hpp */ = {isa = PBXFileReference; fileEncoding = 4; 
lastKnownFileType = sourcecode.cpp.h; path = CPUDepthwiseConvInt8.hpp; sourceTree = ""; }; - 92FF012023AA0B4D00AC97F6 /* CPUSize.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUSize.cpp; sourceTree = ""; }; 92FF012123AA0B4D00AC97F6 /* CPUQuantizedLogistic.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUQuantizedLogistic.cpp; sourceTree = ""; }; 92FF012223AA0B4D00AC97F6 /* CPUBinary.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUBinary.cpp; sourceTree = ""; }; 92FF012423AA0B4D00AC97F6 /* CPUQuantizedMaxPool.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUQuantizedMaxPool.hpp; sourceTree = ""; }; @@ -1113,7 +1113,6 @@ 92FF013423AA0B4E00AC97F6 /* CPUUnary.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUUnary.cpp; sourceTree = ""; }; 92FF013523AA0B4E00AC97F6 /* CPUQuantizedAdd.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUQuantizedAdd.hpp; sourceTree = ""; }; 92FF013623AA0B4E00AC97F6 /* CPUWhere.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUWhere.cpp; sourceTree = ""; }; - 92FF013723AA0B4E00AC97F6 /* CPUSoftmaxGrad.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUSoftmaxGrad.cpp; sourceTree = ""; }; 92FF013B23AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Fast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNUInt8ToInt16WithOffsetC4Fast.S; sourceTree = ""; }; 92FF013C23AA0B4E00AC97F6 /* MNNScaleAddInt8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNScaleAddInt8.S; sourceTree = ""; }; 92FF013D23AA0B4E00AC97F6 /* MNNMatrixProd.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNMatrixProd.S; sourceTree = ""; }; @@ -1122,7 +1121,6 @@ 92FF014023AA0B4E00AC97F6 /* MNNAddC4WithStride.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNAddC4WithStride.S; sourceTree = ""; }; 92FF014123AA0B4E00AC97F6 /* MNNQuanToDestUint8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNQuanToDestUint8.S; sourceTree = ""; }; 92FF014223AA0B4E00AC97F6 /* MNNLoadU8AndSum.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNLoadU8AndSum.S; sourceTree = ""; }; - 92FF014423AA0B4E00AC97F6 /* MNNAddBiasRelu6.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNAddBiasRelu6.S; sourceTree = ""; }; 92FF014523AA0B4E00AC97F6 /* MNNStrassenMergeCFunction.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNStrassenMergeCFunction.S; sourceTree = ""; }; 92FF014623AA0B4E00AC97F6 /* MNNBlitC1ToFloatRGBA.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBlitC1ToFloatRGBA.S; sourceTree = ""; }; 92FF014723AA0B4E00AC97F6 /* MNNCopyC4WithStride.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCopyC4WithStride.S; sourceTree = ""; }; @@ -1130,7 +1128,6 @@ 92FF014923AA0B4E00AC97F6 /* MNNLineDepthWiseInt8AddBiasScaleUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; 
lastKnownFileType = sourcecode.asm; path = MNNLineDepthWiseInt8AddBiasScaleUnit.S; sourceTree = ""; }; 92FF014A23AA0B4E00AC97F6 /* MNNUnPackC4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNUnPackC4.S; sourceTree = ""; }; 92FF014B23AA0B4E00AC97F6 /* MNNSamplerC1NearestOpt.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNSamplerC1NearestOpt.S; sourceTree = ""; }; - 92FF014C23AA0B4E00AC97F6 /* MNNGemmFloatCommon_4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmFloatCommon_4.S; sourceTree = ""; }; 92FF014D23AA0B4E00AC97F6 /* MNNNV21ToRGBUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNNV21ToRGBUnit.S; sourceTree = ""; }; 92FF014E23AA0B4E00AC97F6 /* MNNPackC4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNPackC4.S; sourceTree = ""; }; 92FF014F23AA0B4E00AC97F6 /* MNNMinFloat.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNMinFloat.S; sourceTree = ""; }; @@ -1150,16 +1147,13 @@ 92FF016123AA0B4E00AC97F6 /* MNNPowC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNPowC8.S; sourceTree = ""; }; 92FF016223AA0B4E00AC97F6 /* MNNMatrixAdd.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNMatrixAdd.S; sourceTree = ""; }; 92FF016323AA0B4E00AC97F6 /* MNNExpC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNExpC8.S; sourceTree = ""; }; - 92FF016423AA0B4E00AC97F6 /* MNNAddBiasRelu.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNAddBiasRelu.S; sourceTree = ""; }; 92FF016523AA0B4E00AC97F6 /* MNNConvDwF23SourceTransUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvDwF23SourceTransUnit.S; sourceTree = ""; }; 92FF016623AA0B4E00AC97F6 /* MNNWinogradMatrixProductLeft.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNWinogradMatrixProductLeft.S; sourceTree = ""; }; 92FF016723AA0B4E00AC97F6 /* MNNDeconvRunForUnitDepthWise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNDeconvRunForUnitDepthWise.S; sourceTree = ""; }; 92FF016823AA0B4E00AC97F6 /* MNNSamplerC1BilinearOpt.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNSamplerC1BilinearOpt.S; sourceTree = ""; }; 92FF016A23AA0B4E00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmInt8AddBiasScale_16x4_Unit.S; sourceTree = ""; }; - 92FF016B23AA0B4E00AC97F6 /* MNNGemmFloatOne_4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmFloatOne_4.S; sourceTree = ""; }; 92FF016C23AA0B4E00AC97F6 /* MNNWinogradMatrixProductRight.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNWinogradMatrixProductRight.S; sourceTree = ""; }; 92FF016E23AA0B4E00AC97F6 /* MNNReluWithSlopeChannel.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNReluWithSlopeChannel.S; sourceTree = ""; }; - 92FF016F23AA0B4E00AC97F6 /* MNNAddBias.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = 
sourcecode.asm; path = MNNAddBias.S; sourceTree = ""; }; 92FF017123AA0B4E00AC97F6 /* MNNCoefLine.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCoefLine.S; sourceTree = ""; }; 92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBlitC3ToFloatRGBA.S; sourceTree = ""; }; 92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNUInt8ToInt16WithOffsetC4Common.S; sourceTree = ""; }; @@ -1168,7 +1162,6 @@ 92FF017723AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvDwF23MulTransUnit.S; sourceTree = ""; }; 92FF017823AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForLineDepthwise.S; sourceTree = ""; }; 92FF017923AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmint8to32_8x4_Unit.S; sourceTree = ""; }; - 92FF017A23AA0B4E00AC97F6 /* MNNGemmFloatUnit_4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmFloatUnit_4.S; sourceTree = ""; }; 92FF017D23AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Fast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNUInt8ToInt16WithOffsetC4Fast.S; sourceTree = ""; }; 92FF017E23AA0B4E00AC97F6 /* MNNScaleAddInt8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNScaleAddInt8.S; sourceTree = ""; }; 92FF017F23AA0B4E00AC97F6 /* MNNMatrixProd.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNMatrixProd.S; sourceTree = ""; }; @@ -1177,7 +1170,6 @@ 92FF018223AA0B4E00AC97F6 /* MNNAddC4WithStride.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNAddC4WithStride.S; sourceTree = ""; }; 92FF018323AA0B4E00AC97F6 /* MNNQuanToDestUint8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNQuanToDestUint8.S; sourceTree = ""; }; 92FF018423AA0B4E00AC97F6 /* MNNLoadU8AndSum.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNLoadU8AndSum.S; sourceTree = ""; }; - 92FF018623AA0B4E00AC97F6 /* MNNAddBiasRelu6.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNAddBiasRelu6.S; sourceTree = ""; }; 92FF018723AA0B4E00AC97F6 /* MNNStrassenMergeCFunction.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNStrassenMergeCFunction.S; sourceTree = ""; }; 92FF018823AA0B4E00AC97F6 /* MNNBlitC1ToFloatRGBA.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBlitC1ToFloatRGBA.S; sourceTree = ""; }; 92FF018923AA0B4E00AC97F6 /* MNNCopyC4WithStride.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCopyC4WithStride.S; sourceTree = ""; }; @@ -1185,7 +1177,6 @@ 92FF018B23AA0B4E00AC97F6 /* MNNLineDepthWiseInt8AddBiasScaleUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNLineDepthWiseInt8AddBiasScaleUnit.S; sourceTree = ""; }; 92FF018C23AA0B4E00AC97F6 /* 
MNNUnPackC4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNUnPackC4.S; sourceTree = ""; }; 92FF018D23AA0B4E00AC97F6 /* MNNSamplerC1NearestOpt.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNSamplerC1NearestOpt.S; sourceTree = ""; }; - 92FF018E23AA0B4E00AC97F6 /* MNNGemmFloatCommon_4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmFloatCommon_4.S; sourceTree = ""; }; 92FF018F23AA0B4E00AC97F6 /* MNNNV21ToRGBUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNNV21ToRGBUnit.S; sourceTree = ""; }; 92FF019023AA0B4E00AC97F6 /* MNNPackC4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNPackC4.S; sourceTree = ""; }; 92FF019123AA0B4E00AC97F6 /* MNNMinFloat.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNMinFloat.S; sourceTree = ""; }; @@ -1205,16 +1196,13 @@ 92FF01A223AA0B4E00AC97F6 /* MNNPowC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNPowC8.S; sourceTree = ""; }; 92FF01A323AA0B4E00AC97F6 /* MNNMatrixAdd.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNMatrixAdd.S; sourceTree = ""; }; 92FF01A423AA0B4E00AC97F6 /* MNNExpC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNExpC8.S; sourceTree = ""; }; - 92FF01A523AA0B4E00AC97F6 /* MNNAddBiasRelu.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNAddBiasRelu.S; sourceTree = ""; }; 92FF01A623AA0B4E00AC97F6 /* MNNConvDwF23SourceTransUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvDwF23SourceTransUnit.S; sourceTree = ""; }; 92FF01A723AA0B4E00AC97F6 /* MNNWinogradMatrixProductLeft.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNWinogradMatrixProductLeft.S; sourceTree = ""; }; 92FF01A823AA0B4E00AC97F6 /* MNNDeconvRunForUnitDepthWise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNDeconvRunForUnitDepthWise.S; sourceTree = ""; }; 92FF01A923AA0B4E00AC97F6 /* MNNSamplerC1BilinearOpt.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNSamplerC1BilinearOpt.S; sourceTree = ""; }; 92FF01AB23AA0B4E00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmInt8AddBiasScale_16x4_Unit.S; sourceTree = ""; }; - 92FF01AC23AA0B4E00AC97F6 /* MNNGemmFloatOne_4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmFloatOne_4.S; sourceTree = ""; }; 92FF01AD23AA0B4E00AC97F6 /* MNNWinogradMatrixProductRight.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNWinogradMatrixProductRight.S; sourceTree = ""; }; 92FF01AF23AA0B4E00AC97F6 /* MNNReluWithSlopeChannel.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNReluWithSlopeChannel.S; sourceTree = ""; }; - 92FF01B023AA0B4E00AC97F6 /* MNNAddBias.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNAddBias.S; sourceTree = ""; }; 92FF01B223AA0B4E00AC97F6 /* MNNCoefLine.S */ = {isa = PBXFileReference; 
fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCoefLine.S; sourceTree = ""; }; 92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBlitC3ToFloatRGBA.S; sourceTree = ""; }; 92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNUInt8ToInt16WithOffsetC4Common.S; sourceTree = ""; }; @@ -1223,17 +1211,12 @@ 92FF01B823AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvDwF23MulTransUnit.S; sourceTree = ""; }; 92FF01B923AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForLineDepthwise.S; sourceTree = ""; }; 92FF01BA23AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmint8to32_8x4_Unit.S; sourceTree = ""; }; - 92FF01BB23AA0B4E00AC97F6 /* MNNGemmFloatUnit_4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmFloatUnit_4.S; sourceTree = ""; }; 92FF01BD23AA0B4E00AC97F6 /* MNNAsmGlobal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = MNNAsmGlobal.h; sourceTree = ""; }; - 92FF01BE23AA0B4E00AC97F6 /* CPUReluGrad.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUReluGrad.cpp; sourceTree = ""; }; 92FF01C023AA0B4E00AC97F6 /* CPUArgMax.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUArgMax.hpp; sourceTree = ""; }; - 92FF01C123AA0B4E00AC97F6 /* CPUShape.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUShape.cpp; sourceTree = ""; }; - 92FF01C223AA0B4E00AC97F6 /* CPURank.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPURank.hpp; sourceTree = ""; }; 92FF01C323AA0B4F00AC97F6 /* CPUReduction.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUReduction.cpp; sourceTree = ""; }; 92FF01C523AA0B4F00AC97F6 /* CPUGatherND.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUGatherND.cpp; sourceTree = ""; }; 92FF01C623AA0B4F00AC97F6 /* CPUQuantizedAvgPool.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUQuantizedAvgPool.hpp; sourceTree = ""; }; 92FF01C723AA0B4F00AC97F6 /* CPUGatherND.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUGatherND.hpp; sourceTree = ""; }; - 92FF01CB23AA0B4F00AC97F6 /* CPUTanh.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUTanh.hpp; sourceTree = ""; }; 92FF01CD23AA0B4F00AC97F6 /* CPUSetDiff1D.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUSetDiff1D.hpp; sourceTree = ""; }; 92FF01CE23AA0B4F00AC97F6 /* CPUCast.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUCast.cpp; sourceTree = ""; }; 92FF01D123AA0B4F00AC97F6 /* CPUOneHot.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUOneHot.hpp; sourceTree = ""; }; @@ 
-1245,7 +1228,6 @@ 92FF01DA23AA0B5000AC97F6 /* CPUAsString.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUAsString.cpp; sourceTree = ""; }; 92FF01DB23AA0B5000AC97F6 /* CPUDetectionPostProcess.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUDetectionPostProcess.hpp; sourceTree = ""; }; 92FF01DC23AA0B5000AC97F6 /* CPURelu.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPURelu.hpp; sourceTree = ""; }; - 92FF01E023AA0B5000AC97F6 /* CPUShape.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUShape.hpp; sourceTree = ""; }; 92FF01E423AA0B5100AC97F6 /* CPUScale.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUScale.cpp; sourceTree = ""; }; 92FF01E523AA0B5100AC97F6 /* CPUUnravelIndex.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUUnravelIndex.cpp; sourceTree = ""; }; 92FF01E623AA0B5100AC97F6 /* CPUResize.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUResize.hpp; sourceTree = ""; }; @@ -1262,7 +1244,6 @@ 92FF01F823AA0B5200AC97F6 /* CPUConvolutionDepthwise.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUConvolutionDepthwise.cpp; sourceTree = ""; }; 92FF01F923AA0B5200AC97F6 /* CPUROIPooling.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUROIPooling.hpp; sourceTree = ""; }; 92FF01FA23AA0B5200AC97F6 /* CPUInstanceNorm.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUInstanceNorm.hpp; sourceTree = ""; }; - 92FF01FB23AA0B5200AC97F6 /* CPUSigmoid.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUSigmoid.cpp; sourceTree = ""; }; 92FF01FE23AA0B5200AC97F6 /* CPURelu.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPURelu.cpp; sourceTree = ""; }; 92FF01FF23AA0B5200AC97F6 /* CPUDetectionPostProcess.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUDetectionPostProcess.cpp; sourceTree = ""; }; 92FF020323AA0B5300AC97F6 /* CPUSetDiff1D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUSetDiff1D.cpp; sourceTree = ""; }; @@ -1270,12 +1251,10 @@ 92FF020723AA0B5300AC97F6 /* CPULinSpace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPULinSpace.cpp; sourceTree = ""; }; 92FF020823AA0B5300AC97F6 /* CPUTensorConvert.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUTensorConvert.hpp; sourceTree = ""; }; 92FF020923AA0B5300AC97F6 /* CPUQuantizedLogistic.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUQuantizedLogistic.hpp; sourceTree = ""; }; - 92FF020A23AA0B5300AC97F6 /* CPUSigmoid.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUSigmoid.hpp; sourceTree = ""; }; 92FF020B23AA0B5300AC97F6 /* CPURange.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPURange.cpp; sourceTree = ""; }; 92FF020C23AA0B5500AC97F6 /* CPUUnravelIndex.hpp */ = {isa = PBXFileReference; 
fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUUnravelIndex.hpp; sourceTree = ""; }; 92FF020D23AA0B5500AC97F6 /* CPUEltwise.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUEltwise.hpp; sourceTree = ""; }; 92FF020E23AA0B5500AC97F6 /* CPUMatrixBandPart.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUMatrixBandPart.hpp; sourceTree = ""; }; - 92FF021023AA0B5500AC97F6 /* CPUPriorbox.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUPriorbox.hpp; sourceTree = ""; }; 92FF021223AA0B5600AC97F6 /* CPUBackend.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUBackend.hpp; sourceTree = ""; }; 92FF021323AA0B5600AC97F6 /* CPUDeconvolution.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUDeconvolution.cpp; sourceTree = ""; }; 92FF021423AA0B5600AC97F6 /* CPUQuantizedAdd.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUQuantizedAdd.cpp; sourceTree = ""; }; @@ -1312,7 +1291,6 @@ 92FF023F23AA0B5600AC97F6 /* Int8FunctionsOpt.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Int8FunctionsOpt.h; sourceTree = ""; }; 92FF024023AA0B5600AC97F6 /* DeconvolutionWithStride.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = DeconvolutionWithStride.cpp; sourceTree = ""; }; 92FF024123AA0B5600AC97F6 /* ConvolutionTiledExecutor.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionTiledExecutor.cpp; sourceTree = ""; }; - 92FF024323AA0B5600AC97F6 /* CPURank.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPURank.cpp; sourceTree = ""; }; 92FF024523AA0B5700AC97F6 /* CPUEltwise.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUEltwise.cpp; sourceTree = ""; }; 92FF024623AA0B5700AC97F6 /* CPUInterp.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUInterp.cpp; sourceTree = ""; }; 92FF024723AA0B5700AC97F6 /* CPUReduceJoin.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUReduceJoin.hpp; sourceTree = ""; }; @@ -1402,7 +1380,6 @@ 92FF048C23AA0BFA00AC97F6 /* Execution.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = Execution.hpp; sourceTree = ""; }; 92FF048D23AA0BFA00AC97F6 /* Backend.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Backend.cpp; sourceTree = ""; }; 92FF048E23AA0BFA00AC97F6 /* Macro.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Macro.h; sourceTree = ""; }; - 92FF049023AA0BFA00AC97F6 /* DirectedAcyclicGraph.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = DirectedAcyclicGraph.hpp; sourceTree = ""; }; 92FF049223AA0BFA00AC97F6 /* Schedule.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Schedule.cpp; sourceTree = ""; }; 92FF049323AA0BFA00AC97F6 /* MNNMemoryUtils.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = MNNMemoryUtils.h; sourceTree = ""; }; 92FF049423AA0BFA00AC97F6 
/* TensorUtils.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = TensorUtils.hpp; sourceTree = ""; }; @@ -1447,27 +1424,16 @@ EBD484242485FF640083CE95 /* Arm82Interp.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82Interp.hpp; path = ../arm82/Arm82Interp.hpp; sourceTree = ""; }; EBD484292485FF650083CE95 /* Arm82Interp.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Interp.cpp; path = ../arm82/Arm82Interp.cpp; sourceTree = ""; }; EBECA37A24643D110062C7A3 /* MNNGemmInt8AddBiasScale_ARMV82_Unit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmInt8AddBiasScale_ARMV82_Unit.S; sourceTree = ""; }; - EBECA37C24643D300062C7A3 /* Arm82ConvolutionDepthwise.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82ConvolutionDepthwise.hpp; path = ../arm82/Arm82ConvolutionDepthwise.hpp; sourceTree = ""; }; - EBECA37D24643D300062C7A3 /* Arm82Convolution.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Convolution.cpp; path = ../arm82/Arm82Convolution.cpp; sourceTree = ""; }; - EBECA37E24643D300062C7A3 /* Arm82ConvolutionDepthwise.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82ConvolutionDepthwise.cpp; path = ../arm82/Arm82ConvolutionDepthwise.cpp; sourceTree = ""; }; EBECA38024643D300062C7A3 /* Arm82Pooling.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Pooling.cpp; path = ../arm82/Arm82Pooling.cpp; sourceTree = ""; }; EBECA38124643D310062C7A3 /* Arm82Pooling.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82Pooling.hpp; path = ../arm82/Arm82Pooling.hpp; sourceTree = ""; }; - EBECA38224643D310062C7A3 /* Arm82Convolution3x3.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Convolution3x3.cpp; path = ../arm82/Arm82Convolution3x3.cpp; sourceTree = ""; }; EBECA38324643D310062C7A3 /* Arm82Backend.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82Backend.hpp; path = ../arm82/Arm82Backend.hpp; sourceTree = ""; }; EBECA38424643D310062C7A3 /* Arm82Eltwise.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82Eltwise.hpp; path = ../arm82/Arm82Eltwise.hpp; sourceTree = ""; }; EBECA38524643D310062C7A3 /* Arm82Eltwise.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Eltwise.cpp; path = ../arm82/Arm82Eltwise.cpp; sourceTree = ""; }; EBECA38624643D310062C7A3 /* Arm82Relu.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Relu.cpp; path = ../arm82/Arm82Relu.cpp; sourceTree = ""; }; EBECA38724643D310062C7A3 /* Arm82Relu.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82Relu.hpp; path = ../arm82/Arm82Relu.hpp; sourceTree = ""; }; - EBECA38824643D310062C7A3 /* Arm82Convolution.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82Convolution.hpp; path = ../arm82/Arm82Convolution.hpp; sourceTree = ""; }; EBECA38924643D310062C7A3 /* Arm82Backend.cpp */ = {isa = PBXFileReference; fileEncoding = 4; 
lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Backend.cpp; path = ../arm82/Arm82Backend.cpp; sourceTree = ""; }; - EBECA38A24643D310062C7A3 /* Arm82Convolution3x3.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82Convolution3x3.hpp; path = ../arm82/Arm82Convolution3x3.hpp; sourceTree = ""; }; - EBECA38B24643D310062C7A3 /* Arm82OptFunc.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = Arm82OptFunc.hpp; path = ../arm82/Arm82OptFunc.hpp; sourceTree = ""; }; - EBECA38D24643D320062C7A3 /* Arm82OptFunc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82OptFunc.cpp; path = ../arm82/Arm82OptFunc.cpp; sourceTree = ""; }; EBECA3A024643D4E0062C7A3 /* MNNAsmGlobal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = MNNAsmGlobal.h; path = ../arm82/asm/MNNAsmGlobal.h; sourceTree = ""; }; - EBECA3A224643D5D0062C7A3 /* MNNLineDepthWiseFp16C8Unit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNLineDepthWiseFp16C8Unit.S; path = ../arm82/asm/arm64/MNNLineDepthWiseFp16C8Unit.S; sourceTree = ""; }; EBECA3A324643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNQuantizeFP16_UNIT4.S; path = ../arm82/asm/arm64/MNNQuantizeFP16_UNIT4.S; sourceTree = ""; }; - EBECA3A424643D5D0062C7A3 /* MNNGemmFP16C8_UNIT.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNGemmFP16C8_UNIT.S; path = ../arm82/asm/arm64/MNNGemmFP16C8_UNIT.S; sourceTree = ""; }; - EBECA3A524643D5D0062C7A3 /* MNNShuffleChannelC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNShuffleChannelC8.S; path = ../arm82/asm/arm64/MNNShuffleChannelC8.S; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -1597,6 +1563,8 @@ 48747D51245D9E33000B9709 /* geometry */ = { isa = PBXGroup; children = ( + 4D759B2B25FF89EE0037B0B6 /* GeometryShape.cpp */, + 48A046FB25E4ABAC00CFA868 /* GeometryUnary.cpp */, 48BFC50025B84D2700580F9E /* GeometryGather.cpp */, 481FA84E259C27B30047F01F /* GeometryTensorArray.cpp */, 48608B4D250632EC00CB1D71 /* GeometryComputer.cpp */, @@ -1608,7 +1576,6 @@ 48F5880D24DEA3F000C484A2 /* GeometryPooling3D.cpp */, 48417FED24D13BF50056D9A7 /* GeometryELU.cpp */, 48417FEE24D13BF50056D9A7 /* GeometrySelect.cpp */, - 48417FEF24D13BF50056D9A7 /* GeometryTanH.cpp */, 48417FEC24D13BF50056D9A7 /* GeometryThreshold.cpp */, 4819FB3724C69E680050BD09 /* GeometryBatchMatMul.cpp */, 4819FB3824C69E680050BD09 /* GeometryCosineSimilarity.cpp */, @@ -1679,7 +1646,6 @@ 92FF048923AA0BFA00AC97F6 /* BufferAllocator.cpp */, 92FF049A23AA0BFB00AC97F6 /* BufferAllocator.hpp */, 92FF049E23AA0BFB00AC97F6 /* Concurrency.h */, - 92FF049023AA0BFA00AC97F6 /* DirectedAcyclicGraph.hpp */, 92FF049C23AA0BFB00AC97F6 /* Execution.cpp */, 92FF048C23AA0BFA00AC97F6 /* Execution.hpp */, 92FF049D23AA0BFB00AC97F6 /* FileLoader.cpp */, @@ -1707,6 +1673,21 @@ 48887410215B639D0079B12E /* cpu */ = { isa = PBXGroup; children = ( + 4838EA7B2611BFE20027232C /* CPUGridSample.cpp */, + 4838EA7A2611BFE20027232C /* CPUGridSample.hpp */, + 481C2DE625FE2CD6001ED6DF /* Arm82Functions.cpp */, + 481C2DE425FE2CD6001ED6DF /* Arm82Functions.hpp */, + 481C2DE925FE2CD6001ED6DF /* Arm82InstanceNorm.cpp */, + 481C2DE825FE2CD6001ED6DF /* 
Arm82InstanceNorm.hpp */, + 481C2DEA25FE2CD6001ED6DF /* Arm82Moments.cpp */, + 481C2DE525FE2CD6001ED6DF /* Arm82Moments.hpp */, + 481C2DEB25FE2CD6001ED6DF /* Arm82OptFunc.cpp */, + 481C2DE725FE2CD6001ED6DF /* Arm82OptFunc.hpp */, + 481C2DE225FE2CD5001ED6DF /* Arm82WinogradOptFunc.cpp */, + 481C2DE325FE2CD5001ED6DF /* Arm82WinogradOptFunc.hpp */, + 4896D36425FE2A3C00717702 /* Arm82Unary.cpp */, + 4896D36525FE2A3C00717702 /* Arm82Unary.hpp */, + 4896D36825FE2A3D00717702 /* Arm82Vec.hpp */, 4837147025A599EC004DBDED /* Arm82Binary.cpp */, 4837147125A599EC004DBDED /* Arm82Binary.hpp */, 481FA846259C24A00047F01F /* CPUConvArm82Int8.cpp */, @@ -1726,23 +1707,12 @@ EBD484292485FF650083CE95 /* Arm82Interp.cpp */, EBD484242485FF640083CE95 /* Arm82Interp.hpp */, EB8D2ABD246A4975009948D1 /* Arm82OpRegister.cpp */, - EBECA3A424643D5D0062C7A3 /* MNNGemmFP16C8_UNIT.S */, - EBECA3A224643D5D0062C7A3 /* MNNLineDepthWiseFp16C8Unit.S */, EBECA3A324643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S */, - EBECA3A524643D5D0062C7A3 /* MNNShuffleChannelC8.S */, EBECA3A024643D4E0062C7A3 /* MNNAsmGlobal.h */, EBECA38924643D310062C7A3 /* Arm82Backend.cpp */, EBECA38324643D310062C7A3 /* Arm82Backend.hpp */, - EBECA37D24643D300062C7A3 /* Arm82Convolution.cpp */, - EBECA38824643D310062C7A3 /* Arm82Convolution.hpp */, - EBECA38224643D310062C7A3 /* Arm82Convolution3x3.cpp */, - EBECA38A24643D310062C7A3 /* Arm82Convolution3x3.hpp */, - EBECA37E24643D300062C7A3 /* Arm82ConvolutionDepthwise.cpp */, - EBECA37C24643D300062C7A3 /* Arm82ConvolutionDepthwise.hpp */, EBECA38524643D310062C7A3 /* Arm82Eltwise.cpp */, EBECA38424643D310062C7A3 /* Arm82Eltwise.hpp */, - EBECA38D24643D320062C7A3 /* Arm82OptFunc.cpp */, - EBECA38B24643D310062C7A3 /* Arm82OptFunc.hpp */, EBECA38024643D300062C7A3 /* Arm82Pooling.cpp */, EBECA38124643D310062C7A3 /* Arm82Pooling.hpp */, EBECA38624643D310062C7A3 /* Arm82Relu.cpp */, @@ -1815,8 +1785,6 @@ 92FF00F823AA0B4A00AC97F6 /* CPUPool.hpp */, 92FF00D723AA0B4800AC97F6 /* CPUPoolInt8.cpp */, 92FF00F123AA0B4A00AC97F6 /* CPUPoolInt8.hpp */, - 92FF010223AA0B4B00AC97F6 /* CPUPriorbox.cpp */, - 92FF021023AA0B5500AC97F6 /* CPUPriorbox.hpp */, 92FF012C23AA0B4D00AC97F6 /* CPUProposal.cpp */, 92FF00E423AA0B4900AC97F6 /* CPUProposal.hpp */, 92FF00D523AA0B4800AC97F6 /* CPUQuanConvolutionDepthwise.cpp */, @@ -1834,16 +1802,12 @@ 92FF01EF23AA0B5100AC97F6 /* CPUQuantizedSoftmax.hpp */, 92FF020B23AA0B5300AC97F6 /* CPURange.cpp */, 92FF011123AA0B4C00AC97F6 /* CPURange.hpp */, - 92FF024323AA0B5600AC97F6 /* CPURank.cpp */, - 92FF01C223AA0B4E00AC97F6 /* CPURank.hpp */, 92FF00E523AA0B4900AC97F6 /* CPUReduceJoin.cpp */, 92FF024723AA0B5700AC97F6 /* CPUReduceJoin.hpp */, 92FF01C323AA0B4F00AC97F6 /* CPUReduction.cpp */, 92FF010A23AA0B4B00AC97F6 /* CPUReduction.hpp */, 92FF01FE23AA0B5200AC97F6 /* CPURelu.cpp */, 92FF01DC23AA0B5000AC97F6 /* CPURelu.hpp */, - 92FF01BE23AA0B4E00AC97F6 /* CPUReluGrad.cpp */, - 92FF011D23AA0B4D00AC97F6 /* CPUReluGrad.hpp */, 92FF01EC23AA0B5100AC97F6 /* CPUResize.cpp */, 92FF01E623AA0B5100AC97F6 /* CPUResize.hpp */, 92FF01EB23AA0B5100AC97F6 /* CPURNNSequenceGRU.cpp */, @@ -1860,16 +1824,6 @@ 92FF00E023AA0B4900AC97F6 /* CPUSelect.hpp */, 92FF020323AA0B5300AC97F6 /* CPUSetDiff1D.cpp */, 92FF01CD23AA0B4F00AC97F6 /* CPUSetDiff1D.hpp */, - 92FF01C123AA0B4E00AC97F6 /* CPUShape.cpp */, - 92FF01E023AA0B5000AC97F6 /* CPUShape.hpp */, - 92FF01FB23AA0B5200AC97F6 /* CPUSigmoid.cpp */, - 92FF020A23AA0B5300AC97F6 /* CPUSigmoid.hpp */, - 92FF012023AA0B4D00AC97F6 /* CPUSize.cpp */, - 92FF010123AA0B4B00AC97F6 /* CPUSize.hpp */, 
- 92FF013723AA0B4E00AC97F6 /* CPUSoftmaxGrad.cpp */, - 92FF010023AA0B4B00AC97F6 /* CPUSoftmaxGrad.hpp */, - 92FF00D323AA0B4800AC97F6 /* CPUTanh.cpp */, - 92FF01CB23AA0B4F00AC97F6 /* CPUTanh.hpp */, 92FF025223AA0B5900AC97F6 /* CPUTensorConvert.cpp */, 92FF020823AA0B5300AC97F6 /* CPUTensorConvert.hpp */, 92FF011623AA0B4C00AC97F6 /* CPUTFQuantizedConv2D.cpp */, @@ -1911,6 +1865,9 @@ 489D7A152550FDC800AD896A /* metal */ = { isa = PBXGroup; children = ( + 4838EA802611C00B0027232C /* MetalGridSample.hpp */, + 4838EA812611C00B0027232C /* MetalGridSample.metal */, + 4838EA822611C00B0027232C /* MetalGridSample.mm */, 489D7A162550FDC800AD896A /* MetalReLU6.metal */, 489D7A172550FDC800AD896A /* MetalReduction.hpp */, 489D7A192550FDC800AD896A /* MetalConvolutionGEMM.hpp */, @@ -2009,7 +1966,6 @@ 48C84B6F250F711600EE7666 /* module */ = { isa = PBXGroup; children = ( - 48C84B70250F711600EE7666 /* FixModule.cpp */, 48C84B71250F711600EE7666 /* PipelineModule.cpp */, 48C84B72250F711600EE7666 /* Module.cpp */, 48C84B73250F711600EE7666 /* WhileModule.hpp */, @@ -2018,7 +1974,6 @@ 48C84B76250F711600EE7666 /* WhileModule.cpp */, 48C84B77250F711600EE7666 /* IfModule.cpp */, 48C84B78250F711600EE7666 /* StaticModule.hpp */, - 48C84B79250F711600EE7666 /* FixModule.hpp */, 48C84B7A250F711600EE7666 /* PipelineModule.hpp */, 48C84B7B250F711600EE7666 /* NN.cpp */, ); @@ -2078,7 +2033,6 @@ 92A4E10221F07C76000B0919 /* AutoStorageTest.cpp */, 92D765B8222819EF00178BE5 /* BackendTest.cpp */, 925702D121EF270D00A2A3CA /* BufferAllocatorTest.cpp */, - 92D765BA222819EF00178BE5 /* DirectedAcyclicGraphTest.cpp */, 92A4E0FB21F05A4F000B0919 /* MemoryUtilsTest.cpp */, 925702F521EF604400A2A3CA /* SizeComputerTest.cpp */, 9200045D21EDBDF600BCE892 /* TensorTest.cpp */, @@ -2120,12 +2074,10 @@ 4882C8D0241A24D800DAC168 /* PadTest.cpp */, 4882C8C1241A24D700DAC168 /* Pool3DTest.cpp */, 4882C8DB241A24D900DAC168 /* PoolGradTest.cpp */, - 4882C8D4241A24D800DAC168 /* ReluGradTest.cpp */, 4882C8C8241A24D700DAC168 /* ScatterNdTest.cpp */, 4882C8D7241A24D900DAC168 /* SetDiff1DTest.cpp */, 4882C8DC241A24D900DAC168 /* ShapeTest.cpp */, 4882C8BD241A24D600DAC168 /* SizeTest.cpp */, - 4882C8C4241A24D700DAC168 /* SoftmaxGradTest.cpp */, 4882C8CB241A24D800DAC168 /* SoftplusTest.cpp */, 4882C8BE241A24D700DAC168 /* SoftsignTest.cpp */, 4882C8CD241A24D800DAC168 /* SpaceToDepthTest.cpp */, @@ -2218,7 +2170,6 @@ 92FF014023AA0B4E00AC97F6 /* MNNAddC4WithStride.S */, 92FF014123AA0B4E00AC97F6 /* MNNQuanToDestUint8.S */, 92FF014223AA0B4E00AC97F6 /* MNNLoadU8AndSum.S */, - 92FF014423AA0B4E00AC97F6 /* MNNAddBiasRelu6.S */, 92FF014523AA0B4E00AC97F6 /* MNNStrassenMergeCFunction.S */, 92FF014623AA0B4E00AC97F6 /* MNNBlitC1ToFloatRGBA.S */, 92FF014723AA0B4E00AC97F6 /* MNNCopyC4WithStride.S */, @@ -2226,7 +2177,6 @@ 92FF014923AA0B4E00AC97F6 /* MNNLineDepthWiseInt8AddBiasScaleUnit.S */, 92FF014A23AA0B4E00AC97F6 /* MNNUnPackC4.S */, 92FF014B23AA0B4E00AC97F6 /* MNNSamplerC1NearestOpt.S */, - 92FF014C23AA0B4E00AC97F6 /* MNNGemmFloatCommon_4.S */, 92FF014D23AA0B4E00AC97F6 /* MNNNV21ToRGBUnit.S */, 92FF014E23AA0B4E00AC97F6 /* MNNPackC4.S */, 92FF014F23AA0B4E00AC97F6 /* MNNMinFloat.S */, @@ -2246,16 +2196,13 @@ 92FF016123AA0B4E00AC97F6 /* MNNPowC8.S */, 92FF016223AA0B4E00AC97F6 /* MNNMatrixAdd.S */, 92FF016323AA0B4E00AC97F6 /* MNNExpC8.S */, - 92FF016423AA0B4E00AC97F6 /* MNNAddBiasRelu.S */, 92FF016523AA0B4E00AC97F6 /* MNNConvDwF23SourceTransUnit.S */, 92FF016623AA0B4E00AC97F6 /* MNNWinogradMatrixProductLeft.S */, 92FF016723AA0B4E00AC97F6 /* MNNDeconvRunForUnitDepthWise.S */, 
92FF016823AA0B4E00AC97F6 /* MNNSamplerC1BilinearOpt.S */, 92FF016A23AA0B4E00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S */, - 92FF016B23AA0B4E00AC97F6 /* MNNGemmFloatOne_4.S */, 92FF016C23AA0B4E00AC97F6 /* MNNWinogradMatrixProductRight.S */, 92FF016E23AA0B4E00AC97F6 /* MNNReluWithSlopeChannel.S */, - 92FF016F23AA0B4E00AC97F6 /* MNNAddBias.S */, 92FF017123AA0B4E00AC97F6 /* MNNCoefLine.S */, 92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */, 92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */, @@ -2264,7 +2211,6 @@ 92FF017723AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */, 92FF017823AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */, 92FF017923AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */, - 92FF017A23AA0B4E00AC97F6 /* MNNGemmFloatUnit_4.S */, ); path = arm32; sourceTree = ""; @@ -2272,6 +2218,14 @@ 92FF017C23AA0B4E00AC97F6 /* arm64 */ = { isa = PBXGroup; children = ( + 4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */, + 4896D37325FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S */, + 4896D37425FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S */, + 4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */, + 4896D37025FE2A6A00717702 /* MNNExpFP16.S */, + 4896D37525FE2A6B00717702 /* MNNPackC8FP16.S */, + 4896D37125FE2A6A00717702 /* MNNPackedMatMulFP16.S */, + 4896D37225FE2A6A00717702 /* MNNPackedMatMulRemainFP16.S */, 11A01A0A258785FB00745FA7 /* MNNVectorTop1Float.S */, 11A01A0B258785FB00745FA7 /* MNNVectorTop1Int32.S */, 48034566254157DF004738E3 /* MNNNV21ToBGRAUnit.S */, @@ -2290,7 +2244,6 @@ 92FF018223AA0B4E00AC97F6 /* MNNAddC4WithStride.S */, 92FF018323AA0B4E00AC97F6 /* MNNQuanToDestUint8.S */, 92FF018423AA0B4E00AC97F6 /* MNNLoadU8AndSum.S */, - 92FF018623AA0B4E00AC97F6 /* MNNAddBiasRelu6.S */, 92FF018723AA0B4E00AC97F6 /* MNNStrassenMergeCFunction.S */, 92FF018823AA0B4E00AC97F6 /* MNNBlitC1ToFloatRGBA.S */, 92FF018923AA0B4E00AC97F6 /* MNNCopyC4WithStride.S */, @@ -2298,7 +2251,6 @@ 92FF018B23AA0B4E00AC97F6 /* MNNLineDepthWiseInt8AddBiasScaleUnit.S */, 92FF018C23AA0B4E00AC97F6 /* MNNUnPackC4.S */, 92FF018D23AA0B4E00AC97F6 /* MNNSamplerC1NearestOpt.S */, - 92FF018E23AA0B4E00AC97F6 /* MNNGemmFloatCommon_4.S */, 92FF018F23AA0B4E00AC97F6 /* MNNNV21ToRGBUnit.S */, 92FF019023AA0B4E00AC97F6 /* MNNPackC4.S */, 92FF019123AA0B4E00AC97F6 /* MNNMinFloat.S */, @@ -2318,16 +2270,13 @@ 92FF01A223AA0B4E00AC97F6 /* MNNPowC8.S */, 92FF01A323AA0B4E00AC97F6 /* MNNMatrixAdd.S */, 92FF01A423AA0B4E00AC97F6 /* MNNExpC8.S */, - 92FF01A523AA0B4E00AC97F6 /* MNNAddBiasRelu.S */, 92FF01A623AA0B4E00AC97F6 /* MNNConvDwF23SourceTransUnit.S */, 92FF01A723AA0B4E00AC97F6 /* MNNWinogradMatrixProductLeft.S */, 92FF01A823AA0B4E00AC97F6 /* MNNDeconvRunForUnitDepthWise.S */, 92FF01A923AA0B4E00AC97F6 /* MNNSamplerC1BilinearOpt.S */, 92FF01AB23AA0B4E00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S */, - 92FF01AC23AA0B4E00AC97F6 /* MNNGemmFloatOne_4.S */, 92FF01AD23AA0B4E00AC97F6 /* MNNWinogradMatrixProductRight.S */, 92FF01AF23AA0B4E00AC97F6 /* MNNReluWithSlopeChannel.S */, - 92FF01B023AA0B4E00AC97F6 /* MNNAddBias.S */, 92FF01B223AA0B4E00AC97F6 /* MNNCoefLine.S */, 92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */, 92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */, @@ -2336,7 +2285,6 @@ 92FF01B823AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */, 92FF01B923AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */, 92FF01BA23AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */, - 92FF01BB23AA0B4E00AC97F6 /* MNNGemmFloatUnit_4.S */, 48F9E54B2493511200E46522 /* MNNPackedMatMul.S */, 
48F9E54D2493A0A800E46522 /* MNNPackC4ForMatMul_A.S */, 48FB9DCD24AB080C008E1A2D /* MNNPackC8.S */, @@ -2392,6 +2340,7 @@ EBB38EC621E748B9005F76D7 /* shape */ = { isa = PBXGroup; children = ( + 4838EA8A2611C1310027232C /* ShapeGridSample.cpp */, 481FA852259C27E00047F01F /* ShapeTensorArray.cpp */, 4836CEE4257744120068F6CE /* ShapePlugin.cpp */, 48C84B6B250F709E00EE7666 /* SizeComputer.cpp */, @@ -2483,13 +2432,10 @@ 92FF02B423AA0B5A00AC97F6 /* CPUMoments.hpp in Headers */, C43C822D2518951800A0FF84 /* SkNx_neon.h in Headers */, 489D7AA82550FDC900AD896A /* MetalCast.hpp in Headers */, - 92FF034A23AA0B5A00AC97F6 /* CPUTanh.hpp in Headers */, C43C822F2518951800A0FF84 /* SkNx.h in Headers */, 1F501F842397BA5B004E8721 /* ImageProcess.hpp in Headers */, - 48C84B8A250F711700EE7666 /* FixModule.hpp in Headers */, 1F501F822397BA5B004E8721 /* Interpreter.hpp in Headers */, 1F501F882397BA5B004E8721 /* Tensor.hpp in Headers */, - 92FF028223AA0B5A00AC97F6 /* CPUSoftmaxGrad.hpp in Headers */, 1F501F872397BA5B004E8721 /* Matrix.h in Headers */, 48C84B85250F711700EE7666 /* IfModule.hpp in Headers */, 48C84B98250F71E900EE7666 /* CPUSoftmax.hpp in Headers */, @@ -2500,12 +2446,12 @@ C43C8226251894F400A0FF84 /* Matrix.hpp in Headers */, 92FF026E23AA0B5A00AC97F6 /* CPUQuantizationUtils.hpp in Headers */, 92FF03AA23AA0B5A00AC97F6 /* ConvolutionFloatFactory.h in Headers */, + 4896D36A25FE2A3D00717702 /* Arm82Unary.hpp in Headers */, 489D7A882550FDC900AD896A /* MetalTensorConverter.hpp in Headers */, 1F501F862397BA5B004E8721 /* Rect.h in Headers */, 1F501F8B2397BA5B004E8721 /* MNNSharedContext.h in Headers */, 92FF029623AA0B5A00AC97F6 /* CPUCast.hpp in Headers */, 489D7AB02550FDC900AD896A /* MetalDefine.h in Headers */, - 92FF038923AA0B5A00AC97F6 /* CPUSigmoid.hpp in Headers */, 92FF027A23AA0B5A00AC97F6 /* CPUPool.hpp in Headers */, 1F501F892397BA5B004E8721 /* MNNForwardType.h in Headers */, 92FF027323AA0B5A00AC97F6 /* CPUPoolInt8.hpp in Headers */, @@ -2525,7 +2471,8 @@ 92FF028E23AA0B5A00AC97F6 /* CPULinSpace.hpp in Headers */, 48C84B8F250F711700EE7666 /* Initializer.hpp in Headers */, 92FF038823AA0B5A00AC97F6 /* CPUQuantizedLogistic.hpp in Headers */, - EBECA38E24643D320062C7A3 /* Arm82ConvolutionDepthwise.hpp in Headers */, + 481C2DF225FE2CD6001ED6DF /* Arm82InstanceNorm.hpp in Headers */, + 481C2DEE25FE2CD6001ED6DF /* Arm82Functions.hpp in Headers */, EBD4842A2485FF650083CE95 /* Arm82Interp.hpp in Headers */, 92FF037623AA0B5A00AC97F6 /* CPUBinary.hpp in Headers */, 48608B53250632EC00CB1D71 /* GeometryComputerUtils.hpp in Headers */, @@ -2533,6 +2480,7 @@ 92FF03AC23AA0B5A00AC97F6 /* ResizeFunction.h in Headers */, 92FF037823AA0B5A00AC97F6 /* CPUROIPooling.hpp in Headers */, 48747D6D245D9E33000B9709 /* ConvertUtils.hpp in Headers */, + 4838EA832611C00B0027232C /* MetalGridSample.hpp in Headers */, 92FF038723AA0B5A00AC97F6 /* CPUTensorConvert.hpp in Headers */, 92FF036E23AA0B5A00AC97F6 /* CPUQuantizedSoftmax.hpp in Headers */, 92FF04BF23AA0BFB00AC97F6 /* Concurrency.h in Headers */, @@ -2551,22 +2499,23 @@ 92FF028C23AA0B5A00AC97F6 /* CPUReduction.hpp in Headers */, 92FF03B923AA0B5A00AC97F6 /* ConvOpt.h in Headers */, 92FF04AB23AA0BFB00AC97F6 /* Pipeline.hpp in Headers */, + 481C2DEF25FE2CD6001ED6DF /* Arm82Moments.hpp in Headers */, 489D7A6E2550FDC800AD896A /* MetalROIPooling.hpp in Headers */, 4882C8B9241A22B800DAC168 /* ConvolutionCommon.hpp in Headers */, 92FF034623AA0B5A00AC97F6 /* CPUGatherND.hpp in Headers */, - 92FF038F23AA0B5A00AC97F6 /* CPUPriorbox.hpp in Headers */, 92FF03AE23AA0B5A00AC97F6 /* 
ConvolutionIntFactory.hpp in Headers */, EBECA39524643D320062C7A3 /* Arm82Backend.hpp in Headers */, 92FF04C323AA0BFB00AC97F6 /* Session.hpp in Headers */, 48FA474423AA127B00172C3B /* MergeOptimizer.hpp in Headers */, 92FF039F23AA0B5A00AC97F6 /* CommonOptFunction.h in Headers */, 92FF03BA23AA0B5A00AC97F6 /* ConvolutionWinograd.hpp in Headers */, + 4896D36D25FE2A3D00717702 /* Arm82Vec.hpp in Headers */, 92FF027723AA0B5A00AC97F6 /* CPUUnary.hpp in Headers */, C43C81E02518944F00A0FF84 /* WinogradHelper.hpp in Headers */, 92FF035B23AA0B5A00AC97F6 /* CPURelu.hpp in Headers */, + 481C2DED25FE2CD6001ED6DF /* Arm82WinogradOptFunc.hpp in Headers */, 92FF038D23AA0B5A00AC97F6 /* CPUMatrixBandPart.hpp in Headers */, C43C822E2518951800A0FF84 /* ImageSampler.hpp in Headers */, - EBECA39C24643D320062C7A3 /* Arm82Convolution3x3.hpp in Headers */, 92FF035A23AA0B5A00AC97F6 /* CPUDetectionPostProcess.hpp in Headers */, C43C8200251894BD00A0FF84 /* ThreadPool.hpp in Headers */, 48C84B8E250F711700EE7666 /* RandomGenerator.hpp in Headers */, @@ -2575,10 +2524,8 @@ 92FF025D23AA0B5A00AC97F6 /* CPUInterp.hpp in Headers */, 489D7A8B2550FDC900AD896A /* MetalConvolutionWinograd.hpp in Headers */, 92FF039A23AA0B5A00AC97F6 /* Convolution1x1Strassen.hpp in Headers */, - EBECA39A24643D320062C7A3 /* Arm82Convolution.hpp in Headers */, 92FF029B23AA0B5A00AC97F6 /* CPUScale.hpp in Headers */, 489D7A7B2550FDC800AD896A /* MetalUnary.hpp in Headers */, - 92FF04B123AA0BFB00AC97F6 /* DirectedAcyclicGraph.hpp in Headers */, 92FF036C23AA0B5A00AC97F6 /* CPUConst.hpp in Headers */, 92FF03CA23AA0B5A00AC97F6 /* CPUConvolutionDepthwise.hpp in Headers */, 92FF04A923AA0BFB00AC97F6 /* Schedule.hpp in Headers */, @@ -2603,14 +2550,13 @@ 489D7AB42550FDC900AD896A /* MetalBinary.hpp in Headers */, 92FF04AF23AA0BFB00AC97F6 /* Macro.h in Headers */, 92FF028D23AA0B5A00AC97F6 /* CPUWhere.hpp in Headers */, - 92FF028323AA0B5A00AC97F6 /* CPUSize.hpp in Headers */, 92FF03AF23AA0B5A00AC97F6 /* WinogradOptFunction.hpp in Headers */, 92FF03C923AA0B5A00AC97F6 /* CPUMatMul.hpp in Headers */, EBECA39924643D320062C7A3 /* Arm82Relu.hpp in Headers */, + 4838EA7C2611BFE20027232C /* CPUGridSample.hpp in Headers */, 92FF03B223AA0B5A00AC97F6 /* ConvolutionInt8Executor.hpp in Headers */, 92FF03A523AA0B5A00AC97F6 /* DeconvolutionWithStride.hpp in Headers */, 489D7A7F2550FDC900AD896A /* MetalReLU.hpp in Headers */, - 92FF034123AA0B5A00AC97F6 /* CPURank.hpp in Headers */, 92FF03D123AA0B5A00AC97F6 /* CPUTopKV2.hpp in Headers */, EBECA39624643D320062C7A3 /* Arm82Eltwise.hpp in Headers */, 92FF033F23AA0B5A00AC97F6 /* CPUArgMax.hpp in Headers */, @@ -2624,16 +2570,13 @@ 48747D4F245D9E13000B9709 /* CPURaster.hpp in Headers */, 489D7A822550FDC900AD896A /* MetalPReLU.hpp in Headers */, C43C82312518951800A0FF84 /* ImageBlitter.hpp in Headers */, - 92FF029F23AA0B5A00AC97F6 /* CPUReluGrad.hpp in Headers */, 48C84B84250F711700EE7666 /* WhileModule.hpp in Headers */, 92FF02A923AA0B5A00AC97F6 /* CPUCropAndResize.hpp in Headers */, 92FF037923AA0B5A00AC97F6 /* CPUInstanceNorm.hpp in Headers */, 92FF026223AA0B5A00AC97F6 /* CPUSelect.hpp in Headers */, 92FF02B723AA0B5A00AC97F6 /* CPUQuantizedAdd.hpp in Headers */, - EBECA39D24643D320062C7A3 /* Arm82OptFunc.hpp in Headers */, 92FF03B623AA0B5A00AC97F6 /* StrassenMatmulComputor.hpp in Headers */, 92FF03A623AA0B5A00AC97F6 /* ConvolutionTiledExecutor.hpp in Headers */, - 92FF035F23AA0B5A00AC97F6 /* CPUShape.hpp in Headers */, 92FF036523AA0B5A00AC97F6 /* CPUResize.hpp in Headers */, 92FF04B423AA0BFB00AC97F6 /* MNNMemoryUtils.h in Headers 
*/, 48C84B8B250F711700EE7666 /* PipelineModule.hpp in Headers */, @@ -2658,6 +2601,7 @@ 486E1A9A24F5078D00C16006 /* CPURandomUniform.hpp in Headers */, 92FF038C23AA0B5A00AC97F6 /* CPUEltwise.hpp in Headers */, 92FF028823AA0B5A00AC97F6 /* CPUDequantize.hpp in Headers */, + 481C2DF125FE2CD6001ED6DF /* Arm82OptFunc.hpp in Headers */, C43C8225251894F400A0FF84 /* WingoradGenerater.hpp in Headers */, 489D7A6A2550FDC800AD896A /* MetalConvolutionGEMM.hpp in Headers */, ); @@ -2795,6 +2739,7 @@ 48BB6EF025220A930056E195 /* MNNTranspose32Bit4x4.S in Sources */, 92FF031A23AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWiseInt8.S in Sources */, 92FF031223AA0B5A00AC97F6 /* MNNMaxFloat.S in Sources */, + 481C2DF325FE2CD6001ED6DF /* Arm82InstanceNorm.cpp in Sources */, 92FF02CB23AA0B5A00AC97F6 /* MNNSamplerC1NearestOpt.S in Sources */, 92FF02C223AA0B5A00AC97F6 /* MNNLoadU8AndSum.S in Sources */, 4819FB2E24C1396A0050BD09 /* GeometryLSTM.cpp in Sources */, @@ -2804,9 +2749,11 @@ 489D7AA12550FDC900AD896A /* MetalUnary.mm in Sources */, 92FF037323AA0B5A00AC97F6 /* CPUEltwiseInt8.cpp in Sources */, 489D7AC52550FF9F00AD896A /* ExecutorScope.cpp in Sources */, + 481C2DF025FE2CD6001ED6DF /* Arm82Functions.cpp in Sources */, 92FF042F23AA0B7100AC97F6 /* ShapeSliceTf.cpp in Sources */, 92FF03B523AA0B5A00AC97F6 /* ResizeFunction.cpp in Sources */, 489D7A7D2550FDC900AD896A /* MetalConvolution.mm in Sources */, + 4838EA7D2611BFE20027232C /* CPUGridSample.cpp in Sources */, 92FF04B323AA0BFB00AC97F6 /* Schedule.cpp in Sources */, 92FF036423AA0B5A00AC97F6 /* CPUUnravelIndex.cpp in Sources */, 92FF02C623AA0B5A00AC97F6 /* MNNBlitC1ToFloatRGBA.S in Sources */, @@ -2834,10 +2781,12 @@ 92FF04BE23AA0BFB00AC97F6 /* FileLoader.cpp in Sources */, 92FF02F623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */, 92FF042323AA0B7100AC97F6 /* ShapeScatterNd.cpp in Sources */, + 4896D37D25FE2A6B00717702 /* MNNPackC8FP16.S in Sources */, 92FF045A23AA0B7100AC97F6 /* ShapeBinaryOp.cpp in Sources */, 92FF02E523AA0B5A00AC97F6 /* MNNConvDwF23SourceTransUnit.S in Sources */, EBECA37B24643D110062C7A3 /* MNNGemmInt8AddBiasScale_ARMV82_Unit.S in Sources */, 489D7A862550FDC900AD896A /* MetalMatMul.metal in Sources */, + 481C2DF525FE2CD6001ED6DF /* Arm82OptFunc.cpp in Sources */, 92FF02DA23AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWiseInt8.S in Sources */, 489D7A672550FDC800AD896A /* MetalReLU6.metal in Sources */, 92FF033623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */, @@ -2846,6 +2795,7 @@ 48747D50245D9E13000B9709 /* CPURaster.cpp in Sources */, 489D7A782550FDC800AD896A /* MetalEltwise.mm in Sources */, 92FF02FD23AA0B5A00AC97F6 /* MNNScaleAddInt8.S in Sources */, + 4896D37E25FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S in Sources */, 92FF04A723AA0BFB00AC97F6 /* BackendRegister.cpp in Sources */, 92FF02DF23AA0B5A00AC97F6 /* MNNBilinearProcC1.S in Sources */, 489D7A852550FDC900AD896A /* MetalConvolutionWinograd.metal in Sources */, @@ -2858,7 +2808,6 @@ 48747D6F245D9E33000B9709 /* GeometryConcat.cpp in Sources */, 488F1158247BB2A0008E85C6 /* Arm82Raster.cpp in Sources */, 4819FB3224C1396A0050BD09 /* GeometryReduce.cpp in Sources */, - 92FF034023AA0B5A00AC97F6 /* CPUShape.cpp in Sources */, 92FF02B023AA0B5A00AC97F6 /* CPUDequantize.cpp in Sources */, 92FF04C223AA0BFB00AC97F6 /* Pipeline.cpp in Sources */, 92FF04C423AA0BFB00AC97F6 /* Session.cpp in Sources */, @@ -2874,12 +2823,14 @@ 92FF030323AA0B5A00AC97F6 /* MNNLoadU8AndSum.S in Sources */, 92FF02D223AA0B5A00AC97F6 /* MNNNV21ToRGBAUnit.S in Sources */, 48747D66245D9E33000B9709 /* 
GeometryDepthToSpace.cpp in Sources */, + 481C2DF425FE2CD6001ED6DF /* Arm82Moments.cpp in Sources */, 481FA853259C27E00047F01F /* ShapeTensorArray.cpp in Sources */, 6A131E3F25823349002EC3D6 /* PluginShapeInference.cpp in Sources */, 92FF025723AA0B5A00AC97F6 /* CPUQuanConvolutionDepthwise.cpp in Sources */, 48034563254157CE004738E3 /* MNNNV21ToBGRAUnit.S in Sources */, 48FA474823AA127B00172C3B /* Expr.cpp in Sources */, - EBECA3A924643D5D0062C7A3 /* MNNShuffleChannelC8.S in Sources */, + 4838EA842611C00B0027232C /* MetalGridSample.metal in Sources */, + 481C2DEC25FE2CD6001ED6DF /* Arm82WinogradOptFunc.cpp in Sources */, 92FF039223AA0B5A00AC97F6 /* CPUDeconvolution.cpp in Sources */, 92FF042923AA0B7100AC97F6 /* ShapeLinSpace.cpp in Sources */, 92FF03A723AA0B5A00AC97F6 /* ConvolutionIntFactory.cpp in Sources */, @@ -2887,7 +2838,6 @@ 4836CEE5257744120068F6CE /* ShapePlugin.cpp in Sources */, 92FF027523AA0B5A00AC97F6 /* CPUConvolution.cpp in Sources */, 48747D61245D9E33000B9709 /* ConvertUtils.cpp in Sources */, - EBECA3A624643D5D0062C7A3 /* MNNLineDepthWiseFp16C8Unit.S in Sources */, 92FF043B23AA0B7100AC97F6 /* ShapeDetectionPostProcess.cpp in Sources */, 48417FF124D13BF50056D9A7 /* GeometryELU.cpp in Sources */, 92FF03A023AA0B5A00AC97F6 /* ConvolutionWinograd.cpp in Sources */, @@ -2910,8 +2860,9 @@ 92FF044023AA0B7100AC97F6 /* ShapeSlice.cpp in Sources */, 92FF044723AA0B7100AC97F6 /* ShapeSqueeze.cpp in Sources */, 92FF033923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */, + 4896D36925FE2A3D00717702 /* Arm82Unary.cpp in Sources */, 92FF043423AA0B7100AC97F6 /* ShapeStridedSlice.cpp in Sources */, - 92FF02EB23AA0B5A00AC97F6 /* MNNGemmFloatOne_4.S in Sources */, + 4896D37825FE2A6B00717702 /* MNNExpFP16.S in Sources */, 48FA474A23AA127B00172C3B /* Utils.cpp in Sources */, 92FF043F23AA0B7100AC97F6 /* ShapeTensorConvert.cpp in Sources */, 92FF044B23AA0B7100AC97F6 /* ShapeTile.cpp in Sources */, @@ -2946,25 +2897,21 @@ 92FF025E23AA0B5A00AC97F6 /* CPUROIPooling.cpp in Sources */, 92FF044A23AA0B7100AC97F6 /* ShapeConvolution.cpp in Sources */, 11A01A0D258785FB00745FA7 /* MNNVectorTop1Int32.S in Sources */, - 92FF02FA23AA0B5A00AC97F6 /* MNNGemmFloatUnit_4.S in Sources */, 92FF026A23AA0B5A00AC97F6 /* CPUNonMaxSuppressionV2.cpp in Sources */, 92FF045123AA0B7100AC97F6 /* ShapeArgMax.cpp in Sources */, 48F9E54E2493A0A800E46522 /* MNNPackC4ForMatMul_A.S in Sources */, 92FF033823AA0B5A00AC97F6 /* MNNConvRunForLineDepthwise.S in Sources */, 92FF044F23AA0B7100AC97F6 /* ShapeDepthToSpace.cpp in Sources */, 92FF043323AA0B7100AC97F6 /* ShapeCrop.cpp in Sources */, - 92FF02C423AA0B5A00AC97F6 /* MNNAddBiasRelu6.S in Sources */, 48F5881324DEA3F000C484A2 /* GeometryConv3D.cpp in Sources */, 4882C8BA241A22B800DAC168 /* OpCommonUtils.cpp in Sources */, 92FF02B523AA0B5A00AC97F6 /* CPUTopKV2.cpp in Sources */, 489D7A742550FDC800AD896A /* MetalConvolutionActivation.metal in Sources */, 92FF02BD23AA0B5A00AC97F6 /* MNNMatrixProd.S in Sources */, 489D7A872550FDC900AD896A /* MetalOPRegister.mm in Sources */, - 92FF032B23AA0B5A00AC97F6 /* MNNGemmFloatOne_4.S in Sources */, 48FB9DC724A848D0008E1A2D /* MNNPackedMatMul.S in Sources */, 48BFC50125B84D2700580F9E /* GeometryGather.cpp in Sources */, 48FB9DC824A848D0008E1A2D /* MNNPackC4ForMatMul_A.S in Sources */, - 92FF02A223AA0B5A00AC97F6 /* CPUSize.cpp in Sources */, 48C84B6D250F709E00EE7666 /* SizeComputer.cpp in Sources */, 92FF02EE23AA0B5A00AC97F6 /* MNNReluWithSlopeChannel.S in Sources */, 92FF036A23AA0B5A00AC97F6 /* CPURNNSequenceGRU.cpp in Sources */, @@ 
-2978,7 +2925,6 @@ 92FF02C123AA0B5A00AC97F6 /* MNNQuanToDestUint8.S in Sources */, 489D7A7C2550FDC900AD896A /* MetalBackend.metal in Sources */, 92FF039323AA0B5A00AC97F6 /* CPUQuantizedAdd.cpp in Sources */, - EBECA39F24643D320062C7A3 /* Arm82OptFunc.cpp in Sources */, 92FF02F723AA0B5A00AC97F6 /* MNNConvDwF23MulTransUnit.S in Sources */, EBECA39824643D320062C7A3 /* Arm82Relu.cpp in Sources */, 92FF043823AA0B7100AC97F6 /* ShapeUnravelIndex.cpp in Sources */, @@ -2989,7 +2935,6 @@ 48747D6C245D9E33000B9709 /* GeometrySpaceToBatchND.cpp in Sources */, 489D7A9A2550FDC900AD896A /* MetalConvolutionCommon.mm in Sources */, 92FF044623AA0B7100AC97F6 /* ShapeInnerProduct.cpp in Sources */, - 92FF037A23AA0B5A00AC97F6 /* CPUSigmoid.cpp in Sources */, 92FF036F23AA0B5A00AC97F6 /* CPURuntime.cpp in Sources */, 92FF039D23AA0B5A00AC97F6 /* StrassenMatmulComputor.cpp in Sources */, 92FF030B23AA0B5A00AC97F6 /* MNNUnPackC4.S in Sources */, @@ -3000,8 +2945,8 @@ 4819FB3A24C69E680050BD09 /* GeometryInnerProduct.cpp in Sources */, 92FF037723AA0B5A00AC97F6 /* CPUConvolutionDepthwise.cpp in Sources */, EB45C774244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */, + 4896D37B25FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S in Sources */, 92FF02DE23AA0B5A00AC97F6 /* MNNSamplerC4BilinearOpt.S in Sources */, - 92FF032423AA0B5A00AC97F6 /* MNNAddBiasRelu.S in Sources */, 48FD12BF2466A88D009E9102 /* GeometryConv2DBackPropFilter.cpp in Sources */, 489D7A932550FDC900AD896A /* MetalFixedPoint.metal in Sources */, 92FF02F923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */, @@ -3012,6 +2957,7 @@ 92FF02DC23AA0B5A00AC97F6 /* MNNReluInt8.S in Sources */, 92FF041A23AA0B7100AC97F6 /* ShapeFill.cpp in Sources */, EB45C776244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */, + 4D759B2C25FF89EE0037B0B6 /* GeometryShape.cpp in Sources */, 11A01A07258785EA00745FA7 /* MNNVectorTop1Float.S in Sources */, 92FF035323AA0B5A00AC97F6 /* CPUScatterNd.cpp in Sources */, 48747D6E245D9E33000B9709 /* GeometrySlice.cpp in Sources */, @@ -3021,6 +2967,7 @@ 92FF026523AA0B5A00AC97F6 /* CPUQuantizedAvgPool.cpp in Sources */, 92FF029423AA0B5A00AC97F6 /* CPUMatMul.cpp in Sources */, 48747D62245D9E33000B9709 /* GeometryOPRegister.cpp in Sources */, + 4838EA8B2611C1310027232C /* ShapeGridSample.cpp in Sources */, 92FF03A323AA0B5A00AC97F6 /* ConvOpt.cpp in Sources */, 92FF02CD23AA0B5A00AC97F6 /* MNNNV21ToRGBUnit.S in Sources */, 92FF029A23AA0B5A00AC97F6 /* CPUQuantizedMaxPool.cpp in Sources */, @@ -3031,14 +2978,13 @@ 92FF033723AA0B5A00AC97F6 /* MNNConvDwF23MulTransUnit.S in Sources */, 92FF042C23AA0B7100AC97F6 /* ShapeReduceJoin.cpp in Sources */, C43C81F32518948800A0FF84 /* MNNGemmInt8toFloat32_8x4_Common.S in Sources */, + 4896D37A25FE2A6B00717702 /* MNNPackedMatMulRemainFP16.S in Sources */, 92FF043023AA0B7100AC97F6 /* ShapeQuantizedAvgPool.cpp in Sources */, 92FF030623AA0B5A00AC97F6 /* MNNStrassenMergeCFunction.S in Sources */, 92FF033223AA0B5A00AC97F6 /* MNNBlitC3ToFloatRGBA.S in Sources */, - 92FF02B923AA0B5A00AC97F6 /* CPUSoftmaxGrad.cpp in Sources */, 92FF03BE23AA0B5A00AC97F6 /* DeconvolutionWithStride.cpp in Sources */, 92FF044923AA0B7100AC97F6 /* ShapeGatherND.cpp in Sources */, 489D7AB32550FDC900AD896A /* MetalPReLU.mm in Sources */, - 48C84B81250F711700EE7666 /* FixModule.cpp in Sources */, 489D7AB12550FDC900AD896A /* MetalDefine.metal in Sources */, 48FB9DCE24AB080C008E1A2D /* MNNPackC8.S in Sources */, 92FF02E123AA0B5A00AC97F6 /* MNNPowC8.S in Sources */, @@ -3059,14 +3005,12 @@ 
92FF04BA23AA0BFB00AC97F6 /* WrapExecution.cpp in Sources */, 11A01A06258785EA00745FA7 /* MNNVectorTop1Int32.S in Sources */, 48FB9DC124A8445A008E1A2D /* MNNAxByClampBroadcastC4.S in Sources */, - EBECA38F24643D320062C7A3 /* Arm82Convolution.cpp in Sources */, EBD4842F2485FF660083CE95 /* Arm82Interp.cpp in Sources */, 4819FB3B24C69E680050BD09 /* GeometrySpatialProduct.cpp in Sources */, 92FF02DB23AA0B5A00AC97F6 /* MNNScaleAndAddBias.S in Sources */, 92FF034D23AA0B5A00AC97F6 /* CPUCast.cpp in Sources */, 48C84B83250F711700EE7666 /* Module.cpp in Sources */, 92FF030C23AA0B5A00AC97F6 /* MNNSamplerC1NearestOpt.S in Sources */, - 92FF033A23AA0B5A00AC97F6 /* MNNGemmFloatUnit_4.S in Sources */, 92FF042E23AA0B7100AC97F6 /* ShapeProposal.cpp in Sources */, 48C84B80250F711700EE7666 /* Distributions.cpp in Sources */, 92FF025923AA0B5A00AC97F6 /* CPUPoolInt8.cpp in Sources */, @@ -3087,14 +3031,12 @@ 92FF031823AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWiseUint8.S in Sources */, 92FF039623AA0B5A00AC97F6 /* CPUDepthwiseConvInt8.cpp in Sources */, 92FF04AA23AA0BFB00AC97F6 /* BufferAllocator.cpp in Sources */, - 92FF030523AA0B5A00AC97F6 /* MNNAddBiasRelu6.S in Sources */, 92FF030F23AA0B5A00AC97F6 /* MNNPackC4.S in Sources */, 92FF031D23AA0B5A00AC97F6 /* MNNConvRunForLineDepthWiseUint8.S in Sources */, C43C81FA251894A600A0FF84 /* CommonOptFunctionNeon.cpp in Sources */, 92FF030123AA0B5A00AC97F6 /* MNNAddC4WithStride.S in Sources */, 489D7A7A2550FDC800AD896A /* MetalReduction.metal in Sources */, 92FF02E223AA0B5A00AC97F6 /* MNNMatrixAdd.S in Sources */, - 48417FF324D13BF50056D9A7 /* GeometryTanH.cpp in Sources */, 92FF038223AA0B5A00AC97F6 /* CPUSetDiff1D.cpp in Sources */, 92FF031B23AA0B5A00AC97F6 /* MNNScaleAndAddBias.S in Sources */, 92FF02AD23AA0B5A00AC97F6 /* CPUConvInt8.cpp in Sources */, @@ -3107,7 +3049,6 @@ 92FF032523AA0B5A00AC97F6 /* MNNConvDwF23SourceTransUnit.S in Sources */, 92FF044423AA0B7100AC97F6 /* ShapeLSTM.cpp in Sources */, 92FF043E23AA0B7100AC97F6 /* ShapeBatchToSpaceND.cpp in Sources */, - 92FF030D23AA0B5A00AC97F6 /* MNNGemmFloatCommon_4.S in Sources */, 48C84B88250F711700EE7666 /* IfModule.cpp in Sources */, 481FA84F259C27B30047F01F /* GeometryTensorArray.cpp in Sources */, 48C84B86250F711700EE7666 /* StaticModule.cpp in Sources */, @@ -3126,23 +3067,21 @@ 92FF041B23AA0B7100AC97F6 /* ShapeUnpack.cpp in Sources */, 92FF033523AA0B5A00AC97F6 /* MNNInt8ScaleToFloat.S in Sources */, 4819FB3124C1396A0050BD09 /* GeometryLRN.cpp in Sources */, - 92FF02CC23AA0B5A00AC97F6 /* MNNGemmFloatCommon_4.S in Sources */, 48F9E54C2493511200E46522 /* MNNPackedMatMul.S in Sources */, 92FF026F23AA0B5A00AC97F6 /* CPUInt8ToFloat.cpp in Sources */, - EBECA3A824643D5D0062C7A3 /* MNNGemmFP16C8_UNIT.S in Sources */, 92FF037E23AA0B5A00AC97F6 /* CPUDetectionPostProcess.cpp in Sources */, 92FF045023AA0B7100AC97F6 /* ShapeCropAndResize.cpp in Sources */, 92FF02AB23AA0B5A00AC97F6 /* CPUConst.cpp in Sources */, 92FF03D023AA0B5A00AC97F6 /* CPUTensorConvert.cpp in Sources */, 92FF02C023AA0B5A00AC97F6 /* MNNAddC4WithStride.S in Sources */, 92FF02F823AA0B5A00AC97F6 /* MNNConvRunForLineDepthwise.S in Sources */, + 4896D37925FE2A6B00717702 /* MNNPackedMatMulFP16.S in Sources */, 92FF02B623AA0B5A00AC97F6 /* CPUUnary.cpp in Sources */, C43C81DE2518944F00A0FF84 /* ConvInt83x3.cpp in Sources */, 92FF032723AA0B5A00AC97F6 /* MNNDeconvRunForUnitDepthWise.S in Sources */, 92FF02CA23AA0B5A00AC97F6 /* MNNUnPackC4.S in Sources */, 92FF02E723AA0B5A00AC97F6 /* MNNDeconvRunForUnitDepthWise.S in Sources */, 92FF02BB23AA0B5A00AC97F6 /* 
MNNUInt8ToInt16WithOffsetC4Fast.S in Sources */, - 92FF028423AA0B5A00AC97F6 /* CPUPriorbox.cpp in Sources */, 92FF045923AA0B7100AC97F6 /* ShapeRegister.cpp in Sources */, 489D7AB62550FDC900AD896A /* MetalReLU6.mm in Sources */, 48A8A61221D101A700C2B9A7 /* ImageProcess.cpp in Sources */, @@ -3150,11 +3089,12 @@ 92FF045823AA0B7100AC97F6 /* ShapeReduction.cpp in Sources */, 92FF026D23AA0B5A00AC97F6 /* CPUMatrixBandPart.cpp in Sources */, 92FF02A323AA0B5A00AC97F6 /* CPUQuantizedLogistic.cpp in Sources */, - 92FF032F23AA0B5A00AC97F6 /* MNNAddBias.S in Sources */, + 4838EA852611C00B0027232C /* MetalGridSample.mm in Sources */, 489D7AAF2550FDC900AD896A /* MetalConvolutionWinograd.mm in Sources */, 489D7AA02550FDC900AD896A /* MetalCast.metal in Sources */, 48887728215B639F0079B12E /* WingoradGenerater.cpp in Sources */, 92FF045423AA0B7100AC97F6 /* ShapeRNNSequenceGRU.cpp in Sources */, + 4896D37C25FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S in Sources */, EB8D2ABE246A4975009948D1 /* Arm82OpRegister.cpp in Sources */, 48C84B87250F711700EE7666 /* WhileModule.cpp in Sources */, 48608B51250632EC00CB1D71 /* GeometryComputer.cpp in Sources */, @@ -3167,17 +3107,12 @@ 92FF02C723AA0B5A00AC97F6 /* MNNCopyC4WithStride.S in Sources */, 92FF030923AA0B5A00AC97F6 /* MNNNV21ToBGRUnit.S in Sources */, 92FF032623AA0B5A00AC97F6 /* MNNWinogradMatrixProductLeft.S in Sources */, - EBECA39024643D320062C7A3 /* Arm82ConvolutionDepthwise.cpp in Sources */, 92FF04C023AA0BFB00AC97F6 /* Tensor.cpp in Sources */, 92FF045D23AA0B7100AC97F6 /* ShapeCast.cpp in Sources */, 48A8A61421D101A700C2B9A7 /* ImageBlitter.cpp in Sources */, - 92FF025523AA0B5A00AC97F6 /* CPUTanh.cpp in Sources */, - 92FF02EF23AA0B5A00AC97F6 /* MNNAddBias.S in Sources */, 92FF032223AA0B5A00AC97F6 /* MNNMatrixAdd.S in Sources */, 92FF02D723AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWiseUint8.S in Sources */, 92FF026123AA0B5A00AC97F6 /* CPUCropAndResize.cpp in Sources */, - 92FF03C123AA0B5A00AC97F6 /* CPURank.cpp in Sources */, - EBECA39424643D320062C7A3 /* Arm82Convolution3x3.cpp in Sources */, 48FA474923AA127B00172C3B /* MathOp.cpp in Sources */, 489D7A752550FDC800AD896A /* MetalConvolution.metal in Sources */, 4819FB3C24C69E680050BD09 /* GeometryBatchMatMul.cpp in Sources */, @@ -3188,7 +3123,6 @@ 92FF037023AA0B5A00AC97F6 /* CPUPool.cpp in Sources */, 92FF03AD23AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.cpp in Sources */, 92FF031723AA0B5A00AC97F6 /* MNNConvRunForLineDepthWiseInt8.S in Sources */, - 92FF033D23AA0B5A00AC97F6 /* CPUReluGrad.cpp in Sources */, 489D7AB72550FDC900AD896A /* MetalEltwise.metal in Sources */, 489D7A762550FDC800AD896A /* MetalReduction.mm in Sources */, 92FF032023AA0B5A00AC97F6 /* MNNMatrixSub.S in Sources */, @@ -3201,6 +3135,7 @@ 92FF02DD23AA0B5A00AC97F6 /* MNNConvRunForLineDepthWiseUint8.S in Sources */, 48C84B97250F71E900EE7666 /* CPUBatchMatMul.cpp in Sources */, 92FF026323AA0B5A00AC97F6 /* CPUFloatToInt8.cpp in Sources */, + 48A046FC25E4ABAC00CFA868 /* GeometryUnary.cpp in Sources */, 48C84B82250F711700EE7666 /* PipelineModule.cpp in Sources */, 48FD12BE2466A88D009E9102 /* GeometryImageOp.cpp in Sources */, 92FF035423AA0B5A00AC97F6 /* CPUSelect.cpp in Sources */, @@ -3208,6 +3143,7 @@ 489D7A8F2550FDC900AD896A /* MetalScale.metal in Sources */, 92FF02C923AA0B5A00AC97F6 /* MNNLineDepthWiseInt8AddBiasScaleUnit.S in Sources */, 92FF032823AA0B5A00AC97F6 /* MNNSamplerC1BilinearOpt.S in Sources */, + 4896D37F25FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S in Sources */, 92FF031923AA0B5A00AC97F6 /* 
MNNGemmInt8toFloat32_8x4_Unit.S in Sources */, 92FF044323AA0B7100AC97F6 /* ShapeTopKV2.cpp in Sources */, 92FF02EC23AA0B5A00AC97F6 /* MNNWinogradMatrixProductRight.S in Sources */, @@ -3222,7 +3158,6 @@ C43C81F42518948800A0FF84 /* MNNGemmint8to32_8x4_Common.S in Sources */, 92FF043C23AA0B7100AC97F6 /* ShapeExpandDims.cpp in Sources */, 92FF045723AA0B7100AC97F6 /* ShapeTranspose.cpp in Sources */, - 92FF02E423AA0B5A00AC97F6 /* MNNAddBiasRelu.S in Sources */, 92FF031023AA0B5A00AC97F6 /* MNNMinFloat.S in Sources */, 489D7A712550FDC800AD896A /* MetalConvolutionDepthwise.metal in Sources */, 92FF032A23AA0B5A00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S in Sources */, @@ -3238,7 +3173,6 @@ 48FD03462467C64700456AF5 /* MatMulSpeed.cpp in Sources */, 4882C8F1241A24D900DAC168 /* PadTest.cpp in Sources */, 920004B521EDBDF600BCE892 /* BinaryOPTest.cpp in Sources */, - 92D765BD222819EF00178BE5 /* DirectedAcyclicGraphTest.cpp in Sources */, 4829A2D623CC26AE00623BF5 /* MatMulTest.cpp in Sources */, 920004D221EDBE1100BCE892 /* MNNTestSuite.cpp in Sources */, 4882C8F8241A24D900DAC168 /* SetDiff1DTest.cpp in Sources */, @@ -3252,7 +3186,6 @@ 920004D021EDBDF600BCE892 /* PReLUTest.cpp in Sources */, 920004CE21EDBDF600BCE892 /* UnaryTest.cpp in Sources */, 4882C8F9241A24D900DAC168 /* LinSpaceTest.cpp in Sources */, - 4882C8E5241A24D900DAC168 /* SoftmaxGradTest.cpp in Sources */, 4882C8FC241A24D900DAC168 /* PoolGradTest.cpp in Sources */, 920004A921EDBDF600BCE892 /* ReductionTest.cpp in Sources */, 4882C8FB241A24D900DAC168 /* Conv2DBackPropFilterTest.cpp in Sources */, @@ -3306,7 +3239,6 @@ 4882C8F0241A24D900DAC168 /* ExpandDimsTest.cpp in Sources */, 4882C8DD241A24D900DAC168 /* Convolution3DTest.cpp in Sources */, 920004CB21EDBDF600BCE892 /* SpaceToBatchNDTest.cpp in Sources */, - 4882C8F5241A24D900DAC168 /* ReluGradTest.cpp in Sources */, 4829A2D923CC26AE00623BF5 /* ExtraTest.cpp in Sources */, 4882C8F2241A24D900DAC168 /* StackTest.cpp in Sources */, 920004D421EDBE1100BCE892 /* TestUtils.mm in Sources */, diff --git a/pymnn/CMakeLists.txt b/pymnn/CMakeLists.txt index 5990ce15..71b85788 100644 --- a/pymnn/CMakeLists.txt +++ b/pymnn/CMakeLists.txt @@ -1,3 +1,5 @@ +# The CMakeLists.txt be used for PC (Windows, Mac, Linux) and Android + cmake_minimum_required(VERSION 3.4.1) project(mnnpybridge) @@ -9,6 +11,7 @@ option(MNN_OPENGL "Enable OpenGL" OFF) option(MNN_VULKAN "Enable Vulkan" OFF) option(MNN_CUDA "Enable CUDA" OFF) option(MNN_TENSORRT "Enable TensorRT" OFF) +option(MNN_HIAI "Enable Huawei NPU" OFF) option(PYMNN_USE_ALINNPYTHON "based on AliNNPython" ON) option(PYMNN_RUNTIME_CHECK_VM "AliNNPython version (new/old) can be check on runtime" ON) option(PYMNN_NEW_PYTHON "AliNNPython new version (when PYMNN_RUNTIME_CHECK_VM=OFF)" ON) @@ -35,6 +38,15 @@ endif() if(MNN_VULKAN) target_compile_definitions(mnnpybridge PRIVATE MNN_VULKAN) endif() +if(MNN_CUDA) + target_compile_definitions(mnnpybridge PRIVATE MNN_CUDA) +endif() +if(MNN_TENSORRT) + target_compile_definitions(mnnpybridge PRIVATE MNN_TENSORRT) +endif() +if(MNN_HIAI) + target_compile_definitions(mnnpybridge PRIVATE MNN_HIAI) +endif() if(PYMNN_USE_ALINNPYTHON) target_compile_definitions(mnnpybridge PRIVATE PYMNN_USE_ALINNPYTHON) endif() @@ -81,53 +93,66 @@ else() set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-stack-protector -std=c++11 -O2 -fvisibility=hidden -fvisibility-inlines-hidden") endif() -set(DEPEND_PATH "${CMAKE_CURRENT_LIST_DIR}/3rd_party") -set(LIB_SUBPATH "") -if(WIN32) - if(NOT MNN_BUILD_SHARED_LIBS) - set(LIB_SUBPATH "Static") - 
elseif(MNN_WIN_RUNTIME_MT) - set(LIB_SUBPATH "MT") - else() - set(LIB_SUBPATH "MD") - endif() -elseif(APPLE) - if(MNN_BUILD_SHARED_LIBS) - set(LIB_SUBPATH "Dynamic") - else() - set(LIB_SUBPATH "Static") - endif() -endif() -if(CMAKE_BUILD_TYPE MATCHES Debug) - set(LIB_SUBPATH "Debug/${LIB_SUBPATH}") -else() - set(LIB_SUBPATH "Release/${LIB_SUBPATH}") -endif() -if(WIN32) - if("${CMAKE_SIZEOF_VOID_P}" STREQUAL "4") - set(LIB_SUBPATH "x86/${LIB_SUBPATH}") - else() - set(LIB_SUBPATH "x64/${LIB_SUBPATH}") - endif() -endif() - -target_include_directories(mnnpybridge PRIVATE ${CMAKE_CURRENT_LIST_DIR}/src ${DEPEND_PATH}/MNN/include) if(PYMNN_TRAIN_API) set(MNN_DIR ${CMAKE_CURRENT_LIST_DIR}/..) - target_include_directories(mnnpybridge PRIVATE + target_include_directories(mnnpybridge PRIVATE ${MNN_DIR}/tools/train/source/grad ${MNN_DIR}/tools/train/source/optimizer ${MNN_DIR}/tools/train/source/transformer ${MNN_DIR}/tools/train/source/data ${MNN_DIR}/schema/current ${MNN_DIR}/3rd_party/flatbuffers/include) endif() -target_link_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/MNN/lib/${LIB_SUBPATH}) -target_link_libraries(mnnpybridge PRIVATE MNN) -if(PYMNN_USE_ALINNPYTHON) - target_include_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/AliNNPython/include) - target_link_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/AliNNPython/lib/${LIB_SUBPATH}) - target_link_libraries(mnnpybridge PRIVATE python) +if(WIN32 OR APPLE OR CMAKE_SYSTEM_NAME MATCHES "^Linux") + set(DEPEND_PATH "${CMAKE_CURRENT_LIST_DIR}/3rd_party") + set(LIB_SUBPATH "") + if(WIN32) + if(NOT MNN_BUILD_SHARED_LIBS) + set(LIB_SUBPATH "Static") + elseif(MNN_WIN_RUNTIME_MT) + set(LIB_SUBPATH "MT") + else() + set(LIB_SUBPATH "MD") + endif() + elseif(APPLE) + if(MNN_BUILD_SHARED_LIBS) + set(LIB_SUBPATH "Dynamic") + else() + set(LIB_SUBPATH "Static") + endif() + endif() + if(CMAKE_BUILD_TYPE MATCHES Debug) + set(LIB_SUBPATH "Debug/${LIB_SUBPATH}") + else() + set(LIB_SUBPATH "Release/${LIB_SUBPATH}") + endif() + if(WIN32) + if("${CMAKE_SIZEOF_VOID_P}" STREQUAL "4") + set(LIB_SUBPATH "x86/${LIB_SUBPATH}") + else() + set(LIB_SUBPATH "x64/${LIB_SUBPATH}") + endif() + endif() + + target_include_directories(mnnpybridge PRIVATE ${CMAKE_CURRENT_LIST_DIR}/src ${DEPEND_PATH}/MNN/include) + target_link_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/MNN/lib/${LIB_SUBPATH}) + target_link_libraries(mnnpybridge PRIVATE MNN) + + if(PYMNN_USE_ALINNPYTHON) + target_include_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/AliNNPython/include) + target_link_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/AliNNPython/lib/${LIB_SUBPATH}) + target_link_libraries(mnnpybridge PRIVATE python) + endif() + if(PYMNN_NUMPY_USABLE) + target_include_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/numpy/include) + target_link_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/numpy/lib/${LIB_SUBPATH}) + target_link_libraries(mnnpybridge PRIVATE numpy_python) + endif() +else() + target_include_directories(mnnpybridge PRIVATE ${MNN_DIR}/pymnn/src ${MNN_DIR}/pymnn/android/src/main/c/include) + set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${MNN_DIR}/pymnn/android/src/main/jniLibs/${ANDROID_ABI}) + target_link_libraries(mnnpybridge PRIVATE log MNN MNN_Express) + if(PYMNN_USE_ALINNPYTHON) + target_link_libraries(mnnpybridge PRIVATE AliNNPython) + endif() + if(PYMNN_NUMPY_USABLE) + target_link_libraries(mnnpybridge PRIVATE numpy_python) + endif() endif() -if(PYMNN_NUMPY_USABLE) - target_include_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/numpy/include) - target_link_directories(mnnpybridge PRIVATE 
${DEPEND_PATH}/numpy/lib/${LIB_SUBPATH}) - target_link_libraries(mnnpybridge PRIVATE numpy_python) -endif() \ No newline at end of file
diff --git a/pymnn/pip_package/MNN/__init__.py b/pymnn/pip_package/MNN/__init__.py index 5adb497b..a36d74bd 100644 --- a/pymnn/pip_package/MNN/__init__.py +++ b/pymnn/pip_package/MNN/__init__.py @@ -1,3 +1,7 @@ +# version.py is generated by scripts (build_whl.sh on PC, update_mnn_wrapper_assets.sh on mobile) +# so don't worry about it: don't change it, and don't create version.py manually +from .version import __version__ + _Slice = slice _Int = int _newaxis = None
diff --git a/pymnn/pip_package/MNN/expr/__init__.py b/pymnn/pip_package/MNN/expr/__init__.py index e921fc22..ba20efe5 100644 --- a/pymnn/pip_package/MNN/expr/__init__.py +++ b/pymnn/pip_package/MNN/expr/__init__.py @@ -2,32 +2,41 @@ _Int = int _Float = float from _mnncengine._expr import * import _mnncengine._expr as _F -import numpy as np + +_numpy_supported = False +try: + import numpy as np + _numpy_supported = True +except Exception: + print ("Numpy not found. Using MNN without numpy.") + def _to_var(x, to_float=True): - if isinstance(x, np.ndarray): - if to_float: - if x.dtype != np.float32: - x = x.astype(np.float32) - return _F.const(x, x.shape) - if not to_float: - if x.dtype != np.int32: - x = x.astype(np.int32) - return _F.const(x, x.shape, dtype=_F.int) - elif isinstance(x, (list, tuple)) and x: - x = np.array(x) - if to_float: - if x.dtype != np.float32: - x = x.astype(np.float32) - return _F.const(x, x.shape) - if not to_float: - if x.dtype != np.int32: - x = x.astype(np.int32) - return _F.const(x, x.shape, dtype=_F.int) - elif isinstance(x, _Int): - return _F.const(x, [], dtype=_F.int) - elif isinstance(x, _Float): - return _F.const(x, [], dtype=_F.float) - return x + if _numpy_supported: + if isinstance(x, np.ndarray): # convert numpy ndarray to MNN var + if to_float: + if x.dtype != np.float32: + x = x.astype(np.float32) + return _F.const(x, x.shape) + if not to_float: + if x.dtype != np.int32: + x = x.astype(np.int32) + return _F.const(x, x.shape, dtype=_F.int) + elif isinstance(x, (list, tuple)) and x: # convert list and tuple to MNN Var + x = np.array(x) + if to_float: + if x.dtype != np.float32: + x = x.astype(np.float32) + return _F.const(x, x.shape) + if not to_float: + if x.dtype != np.int32: + x = x.astype(np.int32) + return _F.const(x, x.shape, dtype=_F.int) + else: # No numpy support + if isinstance(x, _Int): + return _F.const(x, [], dtype=_F.int) + elif isinstance(x, _Float): + return _F.const(x, [], dtype=_F.float) + return x def scalar(value): if type(value) == type(1): res = _F.const([value], [], _F.NCHW, _F.int) @@ -56,17 +65,17 @@ def square(x): x = _to_var(x) if not isinstance(x, Var): raise RuntimeError("parameter x is not valid") - return _F.square(x) + return _F.square(x) def sqrt(x): x = _to_var(x) if not isinstance(x, Var): raise RuntimeError("parameter x is not valid") - return _F.sqrt(x) + return _F.sqrt(x) def rsqrt(x): x = _to_var(x) if not isinstance(x, Var): raise RuntimeError("parameter x is not valid") - return _F.rsqrt(x) + return _F.rsqrt(x) def exp(x): x = _to_var(x) if not isinstance(x, Var): @@ -101,7 +110,7 @@ def acos(x): x = _to_var(x) if not isinstance(x, Var): raise RuntimeError("parameter x is not valid") - return _F.acos(x) + return _F.acos(x) def atan(x): x = _to_var(x) if not isinstance(x, Var): @@ -231,7 +240,7 @@ def space_to_batch_nd(input, block_shape, paddings): if len(block_shape.shape) != 1: raise RuntimeError("parameter 
block_shape must be 1-D w/ shape [M]") if len(paddings.shape) != 2 or paddings.shape[-1] != 2: - raise RuntimeError("parameter paddings must be 2-D w/ shape [M, 2]") + raise RuntimeError("parameter paddings must be 2-D w/ shape [M, 2]") return _F.space_to_batch_nd(input, block_shape, paddings) def batch_to_space_nd(input, block_shape, crops): input = _to_var(input) @@ -355,7 +364,7 @@ def stack(values, axis=0): if not isinstance(value, Var): raise RuntimeError("all items in parameter values must be MNN Var type") if value.shape != values[0].shape or value.dtype != values[0].dtype: - raise RuntimeError("all items in parameter values must have same shape and dtype") + raise RuntimeError("all items in parameter values must have same shape and dtype") return _F.stack(values, axis) def slice(input, starts, sizes): input = _to_var(input) @@ -419,7 +428,7 @@ def crop(images, size, axis, offset): raise RuntimeError("parameter offset must be at most 2 if you want to change h/w") if axis == 3: if len(offset) != 1: - raise RuntimeError("parameter offset must be at most 1 if you want to change w only") + raise RuntimeError("parameter offset must be at most 1 if you want to change w only") return _F.crop(images, size, axis, offset) def crop_and_resize(image, boxes, box_ind, crop_size, method=BILINEAR, extrapolation_value=0.): image = _to_var(image) @@ -468,12 +477,12 @@ def reshape(x, shape, original_format=NCHW): if not isinstance(shape, (list, tuple)): raise RuntimeError("parameter shape is not valid") new_length = 1 - skip = False + skip = False for value in shape: if value < 0: skip = True new_length *= value - + if new_length != x.size and not skip: raise RuntimeError("parameter shape is not valid") - return _F.reshape(x, shape, original_format) + return _F.reshape(x, shape, original_format) diff --git a/pymnn/pip_package/MNN/nn/__init__.py b/pymnn/pip_package/MNN/nn/__init__.py index 1f5b1474..0397abdf 100644 --- a/pymnn/pip_package/MNN/nn/__init__.py +++ b/pymnn/pip_package/MNN/nn/__init__.py @@ -7,7 +7,15 @@ import _mnncengine._nn as _nn def load_module_from_file(file_name, input_names, output_names, **kwargs): dynamic = kwargs.get('dynamic', False) shape_mutable = kwargs.get('shape_mutable', False) - module = _nn.load_module_from_file(input_names, output_names, file_name, dynamic, shape_mutable) + rearrange = kwargs.get('rearrange', False) + backend = kwargs.get('backend', _F.Backend.CPU) + memory_mode = kwargs.get('memory_mode', _F.MemoryMode.Normal) + power_mode = kwargs.get('power_mode', _F.PowerMode.Normal) + precision_mode = kwargs.get('precision_mode', _F.PrecisionMode.Normal) + thread_num = kwargs.get('thread_num', 1) + + module = _nn.load_module_from_file(input_names, output_names, file_name, dynamic, shape_mutable, rearrange, + backend, memory_mode, power_mode, precision_mode, thread_num) return module diff --git a/pymnn/pip_package/MNN/tools/mnnconvert.py b/pymnn/pip_package/MNN/tools/mnnconvert.py index bb5a833b..a4225f45 100644 --- a/pymnn/pip_package/MNN/tools/mnnconvert.py +++ b/pymnn/pip_package/MNN/tools/mnnconvert.py @@ -3,38 +3,23 @@ """ python wrapper file for mnn converter tool """ from __future__ import print_function import os +import sys import argparse import _tools as Tools -def usage(): - """ print usage info """ - print("usage: mnnconvert [-h]") - print(" [--framework {TF,CAFFE,ONNX,TFLITE,MNN}") - print(" [--modelFile MODELFILE]") - print(" [--prototxt PROTOTXT]") - print(" [--MNNModel MNNMODEL]") - print(" [--fp16 {True,False}]") - print(" [--weightQuantBits {num 
of bits for weight-only-quant, default:0, which means no quant}]") - print(" [--weightQuantAsymmetric {True,False use asymmetric quant method for weight-only-quant, \ - the default method is symmetric quant, which is compatible with old MNN versions. \ - you can set this flag to True use asymmetric quant method to improve accuracy of the weight-quant model in some cases, \ - but asymmetric quant model cannot run on old MNN versions. You will need to upgrade MNN to new version to solve this problem. \ - default: False, which means using SYMMETRIC quant method}]") - print(" [--compressionParamsFile COMPRESSION_PARAMS_PATH]") - def main(): """ main function """ - accepted_framework = ['TF', 'CAFFE', 'ONNX', 'TFLITE', 'MNN'] + TF, CAFFE, ONNX, MNN, TFLITE = 0, 1, 2, 3, 4 + framework_map = {'TF': TF, 'CAFFE': CAFFE, 'ONNX': ONNX, 'TFLITE': TFLITE, 'MNN': MNN} + parser = argparse.ArgumentParser() parser.add_argument("-f", "--framework", type=str,\ - choices=['TF', 'CAFFE', 'ONNX', 'TFLITE', 'MNN'], default='TF',\ - required=True, help="model type, for example:TF/CAFFE/ONNX/TFLITE/MNN") + choices=list(framework_map.keys()), default='TF', required=True, help="model type") parser.add_argument("--modelFile", type=str, required=True,\ help="tensorflow Pb or caffeModel, for example:xxx.pb/xxx.caffemodel") parser.add_argument("--prototxt", type=str,\ - help="only used for caffe, for example: xxx.prototxt") - parser.add_argument("--MNNModel", type=str, required=True,\ - help="MNN model, ex: xxx.mnn") + parser.add_argument("--prototxt", type=str, help="only used for caffe, for example: xxx.prototxt") + parser.add_argument("--MNNModel", type=str, required=True, help="MNN model, ex: xxx.mnn") + parser.add_argument("--bizCode", type=str, required=True, help="bizcode, ex: MNN") parser.add_argument("--fp16", type=bool, default=False,\ help="{True,False}\ Boolean to change the mnn usage. If True, the output\ @@ -45,31 +30,13 @@ def main(): help="The path of model compression file that stores the int8 calibration \ table for quantization or auxiliary parameters for sparsity.") - TF = 0 - CAFFE = 1 - ONNX = 2 - MNN = 3 - TFLITE = 4 args = parser.parse_args() - if args.framework.upper() in accepted_framework: - if args.framework == 'TF': - framework_type = TF - elif args.framework.upper() == 'CAFFE': - framework_type = CAFFE - elif args.framework.upper() == 'ONNX': - framework_type = ONNX - elif args.framework.upper() == 'MNN': - framework_type = MNN - elif args.framework.upper() == 'TFLITE': - framework_type = TFLITE - else: - usage() - return -1 + framework_type = framework_map[args.framework] if args.modelFile is None or not os.path.exists(args.modelFile): print("modelfile not exist") return -1 if args.MNNModel is None: - usage() + parser.print_help(sys.stderr) return -1 if args.framework.upper() == 'CAFFE': if args.prototxt is None or not os.path.exists(args.prototxt): @@ -86,7 +53,7 @@ def main(): args.compressionParamsFile = "" Tools.mnnconvert(args.MNNModel, args. 
modelFile, framework_type,\ - args.fp16, args.prototxt, args.weightQuantBits, args.weightQuantAsymmetric, args.compressionParamsFile) + args.fp16, args.prototxt, args.weightQuantBits, args.weightQuantAsymmetric, args.compressionParamsFile, args.bizCode) return 0 if __name__ == "__main__": main() diff --git a/pymnn/pip_package/setup.py b/pymnn/pip_package/setup.py index 1d4ba966..faa892ac 100644 --- a/pymnn/pip_package/setup.py +++ b/pymnn/pip_package/setup.py @@ -185,6 +185,7 @@ def configure_extension_build(): tools_include_dirs += [os.path.join(root_dir, "source", "core")] tools_include_dirs += [os.path.join(root_dir, "schema", "current")] tools_include_dirs += [os.path.join(root_dir, "source")] + tools_include_dirs += [np.get_include()] if IS_WINDOWS: tools_include_dirs += [os.path.join(os.environ['Protobuf_SRC_ROOT_FOLDER'], 'src')] @@ -206,7 +207,6 @@ def configure_extension_build(): engine_extra_link_args += ['-Wl,--no-whole-archive'] if IS_WINDOWS: engine_extra_link_args += ['/WHOLEARCHIVE:MNN.lib'] - engine_extra_link_args += ['/WHOLEARCHIVE:MNNTrain.lib'] if IS_DARWIN: tools_extra_link_args += ['-Wl,-all_load'] tools_extra_link_args += tools_depend diff --git a/pymnn/src/MNN.cc b/pymnn/src/MNN.cc index c1da7399..ce60c5d4 100644 --- a/pymnn/src/MNN.cc +++ b/pymnn/src/MNN.cc @@ -5,6 +5,7 @@ */ #include "MNNPyBridge.h" #include "common.h" +#include "util.h" static int tls_key = 0; static int tls_key_2 = 0; @@ -28,8 +29,10 @@ namespace py = pybind11; #include #include #include +//#include #include #include +using namespace MNN::Express; #endif // PYMNN_EXPR_API #ifdef BUILD_OPTYPE @@ -45,15 +48,15 @@ namespace py = pybind11; #include "DataLoader.hpp" #include "Loss.hpp" #include "Transformer.hpp" +#include "PipelineModule.hpp" using namespace MNN::Train; #endif // PYMNN_TRAIN_API #include #include -#include "util.h" using namespace MNN; -using namespace MNN::Express; + using namespace std; struct MNN_TLSData { @@ -598,6 +601,8 @@ static PyObject* PyMNNInterpreter_createSession(PyMNNInterpreter *self, PyObject config.type = MNN_FORWARD_CPU; if (backend) { auto backend_name = object2String(backend); + // Avoid misusing backend not supported by the bridge and corresponding MNN library on python level, + // then user will ask for right version bridge library to us, same like MNN.expr.Backend.* python enum std::unordered_map backend_map = { {"CPU", MNN_FORWARD_CPU}, #ifdef MNN_OPENCL @@ -617,10 +622,14 @@ static PyObject* PyMNNInterpreter_createSession(PyMNNInterpreter *self, PyObject #endif #ifdef MNN_CUDA {"CUDA", MNN_FORWARD_CUDA}, +#endif +#ifdef MNN_HIAI + {"HIAI", MNN_FORWARD_USER_0} #endif }; auto iter = backend_map.find(backend_name); if (iter == backend_map.end()) { + // backend not support, issue on python level when development PyErr_SetString(PyExc_Exception, "PyMNNInterpreter_createSession: backend not support"); return NULL; @@ -1117,8 +1126,8 @@ static int PyMNNInterpreter_init(PyMNNInterpreter *self, PyObject *args, PyObjec "PyMNNInterpreter_new: PyArg_ParseTuple failed"); return -1; } - - self->modelPath = new std::string(path); + auto converted_path = convertBytesEncodeIfNeed(path); + self->modelPath = new std::string(converted_path.data()); if (!self->modelPath) { PyErr_SetString(PyExc_Exception, "PyMNNInterpreter_new: create modelPath string failed"); @@ -1517,7 +1526,7 @@ static PyObject* PyMNNTensor_getNumpyData(PyMNNTensor *self, PyObject *args) { auto data = self->tensor->host(); obj = PyArray_SimpleNewFromData(npy_dims.size(), npy_dims.data(), NPY_DOUBLE, data); } 
else { - MNN_PRINT("tensor can not be read as numpy\n"); + PyErr_SetString(PyExc_Exception, "tensor can not be read as numpy"); Py_RETURN_NONE; } return obj; @@ -2142,27 +2151,27 @@ PyMODINIT_FUNC MOD_INIT_FUNC(void) { #endif if (PyType_Ready(&PyMNNInterpreterType) < 0) { - printf("initMNN: PyType_Ready PyMNNInterpreterType failed"); + PyErr_SetString(PyExc_Exception, "initMNN: PyType_Ready PyMNNInterpreterType failed"); ERROR_RETURN } if (PyType_Ready(&PyMNNSessionType) < 0) { - printf("initMNN: PyType_Ready PyMNNSessionType failed"); + PyErr_SetString(PyExc_Exception, "initMNN: PyType_Ready PyMNNSessionType failed"); ERROR_RETURN } if (PyType_Ready(&PyMNNTensorType) < 0) { - printf("initMNN: PyType_Ready PyMNNTensorType failed"); + PyErr_SetString(PyExc_Exception, "initMNN: PyType_Ready PyMNNTensorType failed"); ERROR_RETURN } if (PyType_Ready(&PyMNNCVImageProcessType) < 0) { - printf("initMNN: PyType_Ready PyMNNCVImageProcessType failed"); + PyErr_SetString(PyExc_Exception, "initMNN: PyType_Ready PyMNNCVImageProcessType failed"); ERROR_RETURN } if (PyType_Ready(&PyMNNCVMatrixType) < 0) { - printf("initMNN: PyType_Ready PyMNNCVMatrixType failed"); + PyErr_SetString(PyExc_Exception, "initMNN: PyType_Ready PyMNNCVMatrixType failed"); ERROR_RETURN } if (PyType_Ready(&PyMNNOpInfoType) < 0) { - printf("initMNN: PyType_Ready PyMNNOpInfoType failed"); + PyErr_SetString(PyExc_Exception, "initMNN: PyType_Ready PyMNNOpInfoType failed"); ERROR_RETURN } #if PY_MAJOR_VERSION >= 3 @@ -2172,12 +2181,12 @@ PyMODINIT_FUNC MOD_INIT_FUNC(void) { #endif // module import failed! if (!m) { - printf("initMNN: import MNN failed"); + PyErr_SetString(PyExc_Exception, "initMNN: import MNN failed"); ERROR_RETURN } #ifdef PYMNN_NUMPY_USABLE if(_import_array() < 0) { - printf("initMNN: init numpy failed"); + PyErr_SetString(PyExc_Exception, "initMNN: init numpy failed"); ERROR_RETURN } #endif @@ -2614,18 +2623,67 @@ PyMODINIT_FUNC MOD_INIT_FUNC(void) { exe->gc(Executor::PART); } }); - expr_module.def("set_thread_number", - [](int numberThread) { - if (numberThread < 1) { - numberThread = 1; - } - if (numberThread > 8) { - numberThread = 8; + py::enum_(expr_module, "Backend") + .value("CPU", MNN_FORWARD_CPU) +#ifdef MNN_OPENCL + .value("OPENCL", MNN_FORWARD_OPENCL) +#endif +#ifdef MNN_OPENGL + .value("OPENGL", MNN_FORWARD_OPENGL) +#endif +#ifdef MNN_VULKAN + .value("VULKAN", MNN_FORWARD_VULKAN) +#endif +#ifdef MNN_METAL + .value("METAL", MNN_FORWARD_METAL) +#endif +#ifdef MNN_TENSORRT + .value("TRT", MNN_FORWARD_USER_1) +#endif +#ifdef MNN_CUDA + .value("CUDA", MNN_FORWARD_CUDA) +#endif +#ifdef MNN_HIAI + .value("HIAI", MNN_FORWARD_USER_0) +#endif + .export_values(); + + using MemoryMode = BackendConfig::MemoryMode; + using PowerMode = BackendConfig::PowerMode; + using PrecisionMode = BackendConfig::PrecisionMode; + py::enum_(expr_module, "MemoryMode") + .value("Normal", MemoryMode::Memory_Normal) + .value("High", MemoryMode::Memory_High) + .value("Low", MemoryMode::Memory_Low) + .export_values(); + py::enum_(expr_module, "PowerMode") + .value("Normal", PowerMode::Power_Normal) + .value("High", PowerMode::Power_High) + .value("Low", PowerMode::Power_Low) + .export_values(); + py::enum_(expr_module, "PrecisionMode") + .value("Normal", PrecisionMode::Precision_Normal) + .value("High", PrecisionMode::Precision_High) + .value("Low", PrecisionMode::Precision_Low) + .export_values(); + expr_module.def("set_config", + [](MNNForwardType backend, MemoryMode memory_mode, PowerMode power_mode, PrecisionMode precision_mode, int 
thread_num) { + if (thread_num < 1 || thread_num > 8) { + PyErr_SetString(PyExc_Exception, "thread_num should bigger than 0 and less than 9"); } + thread_num = std::max(std::min(thread_num, 8), 1); + //auto exe = ExecutorScope::Current(); auto exe = Executor::getGlobalExecutor(); BackendConfig config; - exe->setGlobalExecutorConfig(MNN_FORWARD_CPU, config, numberThread); - }); + config.memory = memory_mode; + config.power = power_mode; + config.precision = precision_mode; + exe->setGlobalExecutorConfig(backend, config, thread_num); + }, + py::arg("backend")=MNN_FORWARD_CPU, py::arg("memory_mode")=MemoryMode::Memory_Normal, + py::arg("power_mode")=PowerMode::Power_Normal, py::arg("precision_mode")=PrecisionMode::Precision_Normal, + py::arg("thread_num")=1); + //Begin of Math OPS //Unary OPS expr_module.def("sign", &Express::_Sign); @@ -3018,12 +3076,32 @@ PyMODINIT_FUNC MOD_INIT_FUNC(void) { return Module::extract(inputs, outputs, fortrain); }); nn_module.def("load_module_from_file", [](const vector& inputs, const vector& outputs, - const char* file_name, bool dynamic, bool shape_mutable) -> Module* { - //Module::Config config {dynamic, shape_mutable}; + const char* file_name, bool dynamic, bool shape_mutable, bool rearrange, + MNNForwardType backend, MemoryMode memory_mode, PowerMode power_mode, + PrecisionMode precision_mode, int thread_num) -> Module* { + BackendConfig backend_config; + backend_config.memory = memory_mode; + backend_config.power = power_mode; + backend_config.precision = precision_mode; + + Module::BackendInfo backend_info; + backend_info.type = backend; + backend_info.config = &backend_config; + Module::Config config; config.dynamic = dynamic; config.shapeMutable = shape_mutable; - return Module::load(inputs, outputs, file_name, &config); + config.rearrange = rearrange; + config.backend = &backend_info; + + auto converted_file_name = convertBytesEncodeIfNeed(file_name); + auto m_ptr = Module::load(inputs, outputs, converted_file_name.data(), &config); + if (m_ptr == nullptr) { + std::string mnn_errno = "load_module_from_file failed "; + mnn_errno = mnn_errno + std::string(file_name); + PyErr_SetString(PyExc_Exception, mnn_errno.c_str()); + } + return m_ptr; }); // CNN @@ -3188,11 +3266,11 @@ PyMODINIT_FUNC MOD_INIT_FUNC(void) { .value("MAXIMUM", NN::Maximum) .value("MOVING_AVERAGE", NN::MovingAverage) .export_values(); -// compress_module.def("train_quant", &PipelineModule::turnQuantize, -// py::arg("module"), -// py::arg("quant_bits") = 8, -// py::arg("feature_scale_method") = NN::FeatureScaleStatMethod::PerTensor, -// py::arg("scale_update_method") = NN::ScaleUpdateMethod::MovingAverage); + compress_module.def("train_quant", &PipelineModule::turnQuantize, + py::arg("module"), + py::arg("quant_bits") = 8, + py::arg("feature_scale_method") = NN::FeatureScaleStatMethod::PerTensor, + py::arg("scale_update_method") = NN::ScaleUpdateMethod::MovingAverage); } // End of Train #endif diff --git a/pymnn/src/MNNTools.cc b/pymnn/src/MNNTools.cc index b105a28a..d597116a 100644 --- a/pymnn/src/MNNTools.cc +++ b/pymnn/src/MNNTools.cc @@ -3,7 +3,7 @@ */ #include #include "structmember.h" - +#include "util.h" #include "MNN_generated.h" #include "PostConverter.hpp" #include "addBizCode.hpp" @@ -13,7 +13,6 @@ #include "tensorflowConverter.hpp" #include "writeFb.hpp" #include "config.hpp" -#include "options.hpp" #include "common/Global.hpp" #include "calibration.hpp" #include "logkit.h" @@ -27,48 +26,48 @@ static PyObject* PyTool_Converter(PyObject *self, PyObject *args) { const char* 
modelFile = NULL; const char* compressionParamsFile = NULL; const char* prototxtFile = NULL; + const char* bizCode = NULL; PyObject* frameworkType = NULL; PyObject* fp16 = NULL; PyObject* weightQuantBits = NULL; PyObject* weightQuantAsymmetric = NULL; - if (!PyArg_ParseTuple(args, "ssOO|sOOs", &mnnModel, &modelFile, + if (!PyArg_ParseTuple(args, "ssOO|sOOss", &mnnModel, &modelFile, &frameworkType, &fp16, &prototxtFile, - &weightQuantBits, &weightQuantAsymmetric, &compressionParamsFile)) { + &weightQuantBits, &weightQuantAsymmetric, &compressionParamsFile, + &bizCode)) { return NULL; } struct modelConfig modelPath; - modelPath.MNNModel = std::string(mnnModel); - modelPath.modelFile = std::string(modelFile); + modelPath.MNNModel = convertBytesEncodeIfNeed(mnnModel); + modelPath.modelFile = convertBytesEncodeIfNeed(modelFile); modelPath.model = static_cast(PyLong_AsLong(frameworkType)); - modelPath.bizCode = std::string(""); + modelPath.bizCode = std::string(bizCode); modelPath.benchmarkModel = false; modelPath.saveHalfFloat = static_cast(PyLong_AsLong(fp16)); modelPath.forTraining = false; modelPath.weightQuantBits = static_cast(PyLong_AsLong(weightQuantBits)); modelPath.weightQuantAsymmetric = static_cast(PyLong_AsLong(weightQuantAsymmetric)); if(prototxtFile){ - modelPath.prototxtFile = std::string(prototxtFile); + modelPath.prototxtFile = convertBytesEncodeIfNeed(prototxtFile); } - common::Options options; if (compressionParamsFile) { - modelPath.compressionParamsFile = std::string(compressionParamsFile); - options = common::BuildOptions(modelPath.compressionParamsFile); + modelPath.compressionParamsFile = convertBytesEncodeIfNeed(compressionParamsFile); } Global::Reset(&modelPath); std::unique_ptr netT = std::unique_ptr(new MNN::NetT()); if (modelPath.model == modelConfig::CAFFE) { - caffe2MNNNet(modelPath.prototxtFile, modelPath.modelFile, modelPath.bizCode, options, netT); + caffe2MNNNet(modelPath.prototxtFile, modelPath.modelFile, modelPath.bizCode, netT); } else if (modelPath.model == modelConfig::TENSORFLOW) { - tensorflow2MNNNet(modelPath.modelFile, modelPath.bizCode, options, netT); + tensorflow2MNNNet(modelPath.modelFile, modelPath.bizCode, netT); } else if (modelPath.model == modelConfig::MNN) { - addBizCode(modelPath.modelFile, modelPath.bizCode, options, netT); + addBizCode(modelPath.modelFile, modelPath.bizCode, netT); } else if (modelPath.model == modelConfig::ONNX) { - onnx2MNNNet(modelPath.modelFile, modelPath.bizCode, options, netT); + onnx2MNNNet(modelPath.modelFile, modelPath.bizCode, netT); } else if (modelPath.model == modelConfig::TFLITE) { - tflite2MNNNet(modelPath.modelFile, modelPath.bizCode, options, netT); + tflite2MNNNet(modelPath.modelFile, modelPath.bizCode, netT); } else { std::cout << "Not Support Model Type" << std::endl; } diff --git a/pymnn/src/common.h b/pymnn/src/common.h index 12707320..648f5821 100644 --- a/pymnn/src/common.h +++ b/pymnn/src/common.h @@ -50,4 +50,4 @@ static int global_new_python_flag = 0; #include #include "structmember.h" #include "numpy/arrayobject.h" -#endif // PYMNN_USE_ALINNPYTHON \ No newline at end of file +#endif // PYMNN_USE_ALINNPYTHON diff --git a/pymnn/src/util.h b/pymnn/src/util.h index d239b444..738f3a78 100644 --- a/pymnn/src/util.h +++ b/pymnn/src/util.h @@ -1,10 +1,44 @@ #pragma once #include +#include #include +#include +#if defined(_MSC_VER) && PY_MAJOR_VERSION >= 3 +#include +#include +#endif #include "common.h" using namespace std; typedef vector INTS; + +// In python3, default str is unicode, then be 
transformed to UTF-8 bytes by pybind. +// In Windows, MNN library assume input bytes be encoded by CP_ACP. +// So we need: UTF-8 bytes -> unicodes -> CP_ACP bytes +inline std::string convertBytesEncodeIfNeed(const char* srcBytes) { +#if defined(_MSC_VER) && PY_MAJOR_VERSION >= 3 + int wideCharSize = MultiByteToWideChar(CP_UTF8, 0, srcBytes, -1, nullptr, 0); + if (wideCharSize == 0) { + return {}; + } + std::unique_ptr unicodes(new wchar_t[wideCharSize]); + if (MultiByteToWideChar(CP_UTF8, 0, srcBytes, -1, unicodes.get(), wideCharSize) == 0) { + return {}; + } + int byteSize = WideCharToMultiByte(CP_ACP, 0, unicodes.get(), wideCharSize, nullptr, 0, nullptr, nullptr); + if (byteSize == 0) { + return {}; + } + std::unique_ptr dstBytes(new char[byteSize]); + if (WideCharToMultiByte(CP_ACP, 0, unicodes.get(), wideCharSize, dstBytes.get(), byteSize, nullptr, nullptr) == 0) { + return {}; + } + return {dstBytes.get(), (size_t)byteSize}; +#else + return {srcBytes}; +#endif +} + // Returns true if obj is a bytes/str or unicode object inline bool checkString(PyObject* obj) { return PyBytes_Check(obj) || PyUnicode_Check(obj); diff --git a/pymnn/update_mnn_wrapper_assets.sh b/pymnn/update_mnn_wrapper_assets.sh index 1d8c2b4f..afea6506 100755 --- a/pymnn/update_mnn_wrapper_assets.sh +++ b/pymnn/update_mnn_wrapper_assets.sh @@ -1,15 +1,18 @@ +#!/bin/bash set -e usage() { - echo "Usage: $0 -p python_version [-t]" - echo -e "\t-p python versions in pyenv" + echo "Usage: $0 -p python_version -v mnn_version [-t]" + echo -e "\t-p python versions in pyenv [only support 2.x]" + echo -e "\t-v MNN version to set" echo -e "\t-t include train API wrapper" exit 1 } -while getopts "p:t" opt; do +while getopts "p:v:t" opt; do case "$opt" in p ) py_version=$OPTARG ;; + v ) mnn_version=$OPTARG ;; t ) train_api=true ;; * ) usage ;; esac @@ -20,6 +23,7 @@ cp -r pip_package/MNN /tmp/mnn_py pushd /tmp/mnn_py/MNN rm -rf tools +echo -e "__version__ = '$mnn_version'" > version.py cat __init__.py | sed '/from . import tools/d' > __init__.py.tmp mv __init__.py.tmp __init__.py @@ -32,14 +36,41 @@ fi find . -name __pycache__ | xargs rm -rf pyenv global $py_version python -c "import compileall; compileall.compile_dir('/tmp/mnn_py/MNN', force=True)" -find . -name *.py | xargs rm -rf +find . -name "*.py" | xargs rm -rf cd .. 
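Note on the pymnn API changes earlier in this patch: below is a minimal usage sketch, not part of the patch itself. It assumes MNN.expr re-exports the symbols registered on _mnncengine._expr (as the existing wrapper already does for Var and related names); the model path and the tensor names are hypothetical, while the argument names and defaults follow MNN/nn/__init__.py and pymnn/src/MNN.cc in this diff.

import MNN.expr as F
import MNN.nn as nn

# Global executor configuration added by this patch; per the binding,
# thread_num is clamped to the 1..8 range.
F.set_config(backend=F.Backend.CPU,
             memory_mode=F.MemoryMode.Normal,
             power_mode=F.PowerMode.Normal,
             precision_mode=F.PrecisionMode.Low,
             thread_num=4)

# load_module_from_file now also forwards a per-module backend configuration
# (rearrange, backend, memory/power/precision modes, thread_num) to Module::load.
net = nn.load_module_from_file(
    "model.mnn",                 # hypothetical model file
    ["input"], ["output"],       # hypothetical input/output tensor names
    shape_mutable=True,
    rearrange=True,
    backend=F.Backend.CPU,
    precision_mode=F.PrecisionMode.Normal,
    thread_num=4)
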
zip -r MNN.zip MNN popd -rm -f android/src/main/assets/MNN.zip -rm -rf iOS/MNNPyBridge/lib/MNN -cp /tmp/mnn_py/MNN.zip android/src/main/assets -cp -r /tmp/mnn_py/MNN iOS/MNNPyBridge/lib +# update wrapper assets from $1 to $2 when pyc (WITHOUT METADATA) is not same +should_update () { + pushd $1 + pyc_files_1=(`find MNN -name *.pyc | sort`) + popd + pushd $2 + pyc_files_2=(`find MNN -name *.pyc | sort`) + popd + if [ ${#pyc_files_1[@]} -ne ${#pyc_files_2[@]} ]; then + return 0 + fi + for ((i=0;i<${#pyc_files_1[@]};i++)); do + if [ ${pyc_files_1[i]} != ${pyc_files_2[i]} ]; then + return 0 + fi + pyc_file=${pyc_files_1[i]} + sum_old=`tail -c +8 $2/$pyc_file | md5sum | awk '{print $1}'` + sum_new=`tail -c +8 $1/$pyc_file | md5sum | awk '{print $1}'` + if [ $sum_old != $sum_new ]; then + return 0 + fi + done + return 1 +} + +if should_update /tmp/mnn_py iOS/MNNPyBridge/lib; then + rm -f android/src/main/assets/MNN.zip + rm -rf iOS/MNNPyBridge/lib/MNN + cp /tmp/mnn_py/MNN.zip android/src/main/assets + cp -r /tmp/mnn_py/MNN iOS/MNNPyBridge/lib +fi rm -rf /tmp/mnn_py diff --git a/release_scripts/publish2hub.sh b/release_scripts/publish2hub.sh deleted file mode 100755 index 3fb7a100..00000000 --- a/release_scripts/publish2hub.sh +++ /dev/null @@ -1,35 +0,0 @@ -# Copies from the files from Gitlab AliNN/MNN to Github MNN repo, -# and remove some internal files. -# This scripts assumes: -# 1. the current directory is the parent directory of "MNN" -# 2. the current directory contains the "GithubMNN" directory - -SOURCE="MNN" -TARGET="GithubMNN" - -# check dirs -if [ ! -d $SOURCE ]; then - echo "$SOURCE Not Found" - exit -1 -fi -if [ ! -d $TARGET ]; then - echo "$TARGET Not Found" - exit -1 -fi - -# remove files except .git in $TARGET -pushd $TARGET > /dev/null -ls | grep -v .git | xargs rm -rf -rm -f .gitignore -popd > /dev/null - -# copy files from $SOURCE to $TARGET -pushd $SOURCE > /dev/null -ls | grep -v .git | xargs -I {} cp -af {} ../$TARGET -cp -f .gitignore ../$TARGET -popd > /dev/null - -# reverting files -pushd $TARGET > /dev/null -# git clean -df -popd > /dev/null diff --git a/release_scripts/publish2lab.sh b/release_scripts/publish2lab.sh deleted file mode 100755 index c6ba2dbf..00000000 --- a/release_scripts/publish2lab.sh +++ /dev/null @@ -1,63 +0,0 @@ -# Copies from the files from Gitlab AliNN/AliNNPrivate to Gitlab AliNN/MNN repo, -# and remove some internal files. -# This scripts assumes: -# 1. the current directory is the parent directory of "AliNNPrivate" -# 2. the current directory contains the "MNN" directory - -SOURCE="AliNNPrivate" -TARGET="MNN" - -# check dirs -if [ ! -d $SOURCE ]; then - echo "$SOURCE Not Found" - exit -1 -fi -if [ ! -d $TARGET ]; then - echo "$TARGET Not Found" - exit -1 -fi - -# remove files except .git in $TARGET -pushd $TARGET > /dev/null -ls | grep -v .git | xargs rm -rf -rm -f .gitignore -popd > /dev/null - -# copy files from $SOURCE to $TARGET -pushd $SOURCE > /dev/null -# Remove gitignored and untracked files. 
-git clean -df - -ls | grep -v .git | xargs -I {} cp -af {} ../$TARGET -cp -f .gitignore ../$TARGET -rm -rf ../$TARGET/release_scripts -rm -rf ../$TARGET/pymnn/android -rm -rf ../$TARGET/pymnn/iOS -rm -f ../$TARGET/pymnn/renameForAliNNPython.h -rm -f ../$TARGET/pymnn/src/private_define.h -rm -f ../$TARGET/pymnn/src/renameForAliNNPython.h -rm -f ../$TARGET/pymnn/MNNBridge.podspec -rm -f ../$TARGET/source/backend/hiai/3rdParty -popd > /dev/null - -# reverting files -pushd $TARGET > /dev/null -git checkout -- benchmark/models/*.mnn -git checkout -- project/android/build.gradle -popd > /dev/null - -# try re-build -pushd $TARGET > /dev/null - -# MNN -rm -rf build -rm -rf schema/private -rm -rf schema/current - -./schema/generate.sh -mkdir build && cd build -cmake .. -DMNN_BUILD_TEST=true -DMNN_BUILD_CONVERTER=true -DMNN_BUILD_QUANTOOLS=true -make -j4 -./run_test.out - -popd > /dev/null diff --git a/schema/current/MNN_generated.h b/schema/current/MNN_generated.h index 3f08749d..c1362e5a 100644 --- a/schema/current/MNN_generated.h +++ b/schema/current/MNN_generated.h @@ -45,6 +45,9 @@ struct TensorDescribeT; struct SubGraphProto; struct SubGraphProtoT; +struct TensorQuantInfo; +struct TensorQuantInfoT; + struct Net; struct NetT; @@ -68,6 +71,8 @@ inline const flatbuffers::TypeTable *TensorDescribeTypeTable(); inline const flatbuffers::TypeTable *SubGraphProtoTypeTable(); +inline const flatbuffers::TypeTable *TensorQuantInfoTypeTable(); + inline const flatbuffers::TypeTable *NetTypeTable(); enum OpType { @@ -207,6 +212,7 @@ enum OpType { OpType_TensorArraySplit = 139, OpType_TensorArrayConcat = 140, OpType_LSTMBlockCell = 141, + OpType_Reverse = 142, OpType_Plugin = 256, OpType_Select = 257, OpType_ZerosLike = 258, @@ -230,11 +236,12 @@ enum OpType { OpType_While = 600, OpType_If = 601, OpType_LayerNorm = 603, + OpType_GridSample = 604, OpType_MIN = OpType_AbsVal, - OpType_MAX = OpType_LayerNorm + OpType_MAX = OpType_GridSample }; -inline const OpType (&EnumValuesOpType())[159] { +inline const OpType (&EnumValuesOpType())[161] { static const OpType values[] = { OpType_AbsVal, OpType_QuantizedAdd, @@ -372,6 +379,7 @@ inline const OpType (&EnumValuesOpType())[159] { OpType_TensorArraySplit, OpType_TensorArrayConcat, OpType_LSTMBlockCell, + OpType_Reverse, OpType_Plugin, OpType_Select, OpType_ZerosLike, @@ -394,7 +402,8 @@ inline const OpType (&EnumValuesOpType())[159] { OpType_EltwiseInt8, OpType_While, OpType_If, - OpType_LayerNorm + OpType_LayerNorm, + OpType_GridSample }; return values; } @@ -543,7 +552,7 @@ inline const char * const *EnumNamesOpType() { "TensorArraySplit", "TensorArrayConcat", "LSTMBlockCell", - "", + "Reverse", "", "", "", @@ -1005,13 +1014,14 @@ inline const char * const *EnumNamesOpType() { "If", "", "LayerNorm", + "GridSample", nullptr }; return names; } inline const char *EnumNameOpType(OpType e) { - if (e < OpType_AbsVal || e > OpType_LayerNorm) return ""; + if (e < OpType_AbsVal || e > OpType_GridSample) return ""; const size_t index = static_cast(e); return EnumNamesOpType()[index]; } @@ -1108,11 +1118,12 @@ enum OpParameter { OpParameter_LayerNorm = 88, OpParameter_TensorArray = 89, OpParameter_LSTMBlockCell = 90, + OpParameter_GridSample = 91, OpParameter_MIN = OpParameter_NONE, - OpParameter_MAX = OpParameter_LSTMBlockCell + OpParameter_MAX = OpParameter_GridSample }; -inline const OpParameter (&EnumValuesOpParameter())[91] { +inline const OpParameter (&EnumValuesOpParameter())[92] { static const OpParameter values[] = { OpParameter_NONE, OpParameter_QuantizedAdd, @@ 
-1204,7 +1215,8 @@ inline const OpParameter (&EnumValuesOpParameter())[91] { OpParameter_RandomUniform, OpParameter_LayerNorm, OpParameter_TensorArray, - OpParameter_LSTMBlockCell + OpParameter_LSTMBlockCell, + OpParameter_GridSample }; return values; } @@ -1302,13 +1314,14 @@ inline const char * const *EnumNamesOpParameter() { "LayerNorm", "TensorArray", "LSTMBlockCell", + "GridSample", nullptr }; return names; } inline const char *EnumNameOpParameter(OpParameter e) { - if (e < OpParameter_NONE || e > OpParameter_LSTMBlockCell) return ""; + if (e < OpParameter_NONE || e > OpParameter_GridSample) return ""; const size_t index = static_cast(e); return EnumNamesOpParameter()[index]; } @@ -1677,6 +1690,10 @@ template<> struct OpParameterTraits { static const OpParameter enum_value = OpParameter_LSTMBlockCell; }; +template<> struct OpParameterTraits { + static const OpParameter enum_value = OpParameter_GridSample; +}; + struct OpParameterUnion { OpParameter type; void *value; @@ -2428,6 +2445,14 @@ struct OpParameterUnion { return type == OpParameter_LSTMBlockCell ? reinterpret_cast(value) : nullptr; } + GridSampleT *AsGridSample() { + return type == OpParameter_GridSample ? + reinterpret_cast(value) : nullptr; + } + const GridSampleT *AsGridSample() const { + return type == OpParameter_GridSample ? + reinterpret_cast(value) : nullptr; + } }; bool VerifyOpParameter(flatbuffers::Verifier &verifier, const void *obj, OpParameter type); @@ -3316,6 +3341,9 @@ struct Op FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { const LSTMBlockCell *main_as_LSTMBlockCell() const { return main_type() == OpParameter_LSTMBlockCell ? static_cast(main()) : nullptr; } + const GridSample *main_as_GridSample() const { + return main_type() == OpParameter_GridSample ? static_cast(main()) : nullptr; + } const flatbuffers::String *name() const { return GetPointer(VT_NAME); } @@ -3708,6 +3736,10 @@ template<> inline const LSTMBlockCell *Op::main_as() const { return main_as_LSTMBlockCell(); } +template<> inline const GridSample *Op::main_as() const { + return main_as_GridSample(); +} + struct OpBuilder { flatbuffers::FlatBufferBuilder &fbb_; flatbuffers::uoffset_t start_; @@ -3983,6 +4015,7 @@ struct TensorDescribeT : public flatbuffers::NativeTable { int32_t index; std::string name; std::vector> regions; + std::unique_ptr quantInfo; TensorDescribeT() : index(0) { } @@ -3997,7 +4030,8 @@ struct TensorDescribe FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { VT_BLOB = 4, VT_INDEX = 6, VT_NAME = 8, - VT_REGIONS = 10 + VT_REGIONS = 10, + VT_QUANTINFO = 12 }; const Blob *blob() const { return GetPointer(VT_BLOB); @@ -4011,6 +4045,9 @@ struct TensorDescribe FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { const flatbuffers::Vector> *regions() const { return GetPointer> *>(VT_REGIONS); } + const TensorQuantInfo *quantInfo() const { + return GetPointer(VT_QUANTINFO); + } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_BLOB) && @@ -4021,6 +4058,8 @@ struct TensorDescribe FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { VerifyOffset(verifier, VT_REGIONS) && verifier.VerifyVector(regions()) && verifier.VerifyVectorOfTables(regions()) && + VerifyOffset(verifier, VT_QUANTINFO) && + verifier.VerifyTable(quantInfo()) && verifier.EndTable(); } TensorDescribeT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; @@ -4043,6 +4082,9 @@ struct TensorDescribeBuilder { void add_regions(flatbuffers::Offset>> regions) { 
fbb_.AddOffset(TensorDescribe::VT_REGIONS, regions); } + void add_quantInfo(flatbuffers::Offset quantInfo) { + fbb_.AddOffset(TensorDescribe::VT_QUANTINFO, quantInfo); + } explicit TensorDescribeBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -4060,8 +4102,10 @@ inline flatbuffers::Offset CreateTensorDescribe( flatbuffers::Offset blob = 0, int32_t index = 0, flatbuffers::Offset name = 0, - flatbuffers::Offset>> regions = 0) { + flatbuffers::Offset>> regions = 0, + flatbuffers::Offset quantInfo = 0) { TensorDescribeBuilder builder_(_fbb); + builder_.add_quantInfo(quantInfo); builder_.add_regions(regions); builder_.add_name(name); builder_.add_index(index); @@ -4074,7 +4118,8 @@ inline flatbuffers::Offset CreateTensorDescribeDirect( flatbuffers::Offset blob = 0, int32_t index = 0, const char *name = nullptr, - const std::vector> *regions = nullptr) { + const std::vector> *regions = nullptr, + flatbuffers::Offset quantInfo = 0) { auto name__ = name ? _fbb.CreateString(name) : 0; auto regions__ = regions ? _fbb.CreateVector>(*regions) : 0; return MNN::CreateTensorDescribe( @@ -4082,7 +4127,8 @@ inline flatbuffers::Offset CreateTensorDescribeDirect( blob, index, name__, - regions__); + regions__, + quantInfo); } flatbuffers::Offset CreateTensorDescribe(flatbuffers::FlatBufferBuilder &_fbb, const TensorDescribeT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); @@ -4094,6 +4140,7 @@ struct SubGraphProtoT : public flatbuffers::NativeTable { std::vector outputs; std::vector tensors; std::vector> nodes; + std::vector> extraTensorDescribe; SubGraphProtoT() { } }; @@ -4108,7 +4155,8 @@ struct SubGraphProto FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { VT_INPUTS = 6, VT_OUTPUTS = 8, VT_TENSORS = 10, - VT_NODES = 12 + VT_NODES = 12, + VT_EXTRATENSORDESCRIBE = 14 }; const flatbuffers::String *name() const { return GetPointer(VT_NAME); @@ -4125,6 +4173,9 @@ struct SubGraphProto FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { const flatbuffers::Vector> *nodes() const { return GetPointer> *>(VT_NODES); } + const flatbuffers::Vector> *extraTensorDescribe() const { + return GetPointer> *>(VT_EXTRATENSORDESCRIBE); + } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_NAME) && @@ -4139,6 +4190,9 @@ struct SubGraphProto FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { VerifyOffset(verifier, VT_NODES) && verifier.VerifyVector(nodes()) && verifier.VerifyVectorOfTables(nodes()) && + VerifyOffset(verifier, VT_EXTRATENSORDESCRIBE) && + verifier.VerifyVector(extraTensorDescribe()) && + verifier.VerifyVectorOfTables(extraTensorDescribe()) && verifier.EndTable(); } SubGraphProtoT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; @@ -4164,6 +4218,9 @@ struct SubGraphProtoBuilder { void add_nodes(flatbuffers::Offset>> nodes) { fbb_.AddOffset(SubGraphProto::VT_NODES, nodes); } + void add_extraTensorDescribe(flatbuffers::Offset>> extraTensorDescribe) { + fbb_.AddOffset(SubGraphProto::VT_EXTRATENSORDESCRIBE, extraTensorDescribe); + } explicit SubGraphProtoBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -4182,8 +4239,10 @@ inline flatbuffers::Offset CreateSubGraphProto( flatbuffers::Offset> inputs = 0, flatbuffers::Offset> outputs = 0, flatbuffers::Offset>> tensors = 0, - flatbuffers::Offset>> nodes = 0) { + flatbuffers::Offset>> nodes = 0, + flatbuffers::Offset>> extraTensorDescribe = 0) { 
SubGraphProtoBuilder builder_(_fbb); + builder_.add_extraTensorDescribe(extraTensorDescribe); builder_.add_nodes(nodes); builder_.add_tensors(tensors); builder_.add_outputs(outputs); @@ -4198,23 +4257,131 @@ inline flatbuffers::Offset CreateSubGraphProtoDirect( const std::vector *inputs = nullptr, const std::vector *outputs = nullptr, const std::vector> *tensors = nullptr, - const std::vector> *nodes = nullptr) { + const std::vector> *nodes = nullptr, + const std::vector> *extraTensorDescribe = nullptr) { auto name__ = name ? _fbb.CreateString(name) : 0; auto inputs__ = inputs ? _fbb.CreateVector(*inputs) : 0; auto outputs__ = outputs ? _fbb.CreateVector(*outputs) : 0; auto tensors__ = tensors ? _fbb.CreateVector>(*tensors) : 0; auto nodes__ = nodes ? _fbb.CreateVector>(*nodes) : 0; + auto extraTensorDescribe__ = extraTensorDescribe ? _fbb.CreateVector>(*extraTensorDescribe) : 0; return MNN::CreateSubGraphProto( _fbb, name__, inputs__, outputs__, tensors__, - nodes__); + nodes__, + extraTensorDescribe__); } flatbuffers::Offset CreateSubGraphProto(flatbuffers::FlatBufferBuilder &_fbb, const SubGraphProtoT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +struct TensorQuantInfoT : public flatbuffers::NativeTable { + typedef TensorQuantInfo TableType; + float scale; + float zero; + float min; + float max; + DataType type; + TensorQuantInfoT() + : scale(0.0f), + zero(0.0f), + min(-128.0f), + max(127.0f), + type(DataType_DT_INVALID) { + } +}; + +struct TensorQuantInfo FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef TensorQuantInfoT NativeTableType; + static const flatbuffers::TypeTable *MiniReflectTypeTable() { + return TensorQuantInfoTypeTable(); + } + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_SCALE = 4, + VT_ZERO = 6, + VT_MIN = 8, + VT_MAX = 10, + VT_TYPE = 12 + }; + float scale() const { + return GetField(VT_SCALE, 0.0f); + } + float zero() const { + return GetField(VT_ZERO, 0.0f); + } + float min() const { + return GetField(VT_MIN, -128.0f); + } + float max() const { + return GetField(VT_MAX, 127.0f); + } + DataType type() const { + return static_cast(GetField(VT_TYPE, 0)); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_SCALE) && + VerifyField(verifier, VT_ZERO) && + VerifyField(verifier, VT_MIN) && + VerifyField(verifier, VT_MAX) && + VerifyField(verifier, VT_TYPE) && + verifier.EndTable(); + } + TensorQuantInfoT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(TensorQuantInfoT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; + static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const TensorQuantInfoT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct TensorQuantInfoBuilder { + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_scale(float scale) { + fbb_.AddElement(TensorQuantInfo::VT_SCALE, scale, 0.0f); + } + void add_zero(float zero) { + fbb_.AddElement(TensorQuantInfo::VT_ZERO, zero, 0.0f); + } + void add_min(float min) { + fbb_.AddElement(TensorQuantInfo::VT_MIN, min, -128.0f); + } + void add_max(float max) { + fbb_.AddElement(TensorQuantInfo::VT_MAX, max, 127.0f); + } + void add_type(DataType type) { + fbb_.AddElement(TensorQuantInfo::VT_TYPE, static_cast(type), 0); + } + explicit TensorQuantInfoBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + 
TensorQuantInfoBuilder &operator=(const TensorQuantInfoBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateTensorQuantInfo( + flatbuffers::FlatBufferBuilder &_fbb, + float scale = 0.0f, + float zero = 0.0f, + float min = -128.0f, + float max = 127.0f, + DataType type = DataType_DT_INVALID) { + TensorQuantInfoBuilder builder_(_fbb); + builder_.add_type(type); + builder_.add_max(max); + builder_.add_min(min); + builder_.add_zero(zero); + builder_.add_scale(scale); + return builder_.Finish(); +} + +flatbuffers::Offset CreateTensorQuantInfo(flatbuffers::FlatBufferBuilder &_fbb, const TensorQuantInfoT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); + struct NetT : public flatbuffers::NativeTable { typedef Net TableType; std::string bizCode; @@ -4715,6 +4882,7 @@ inline void TensorDescribe::UnPackTo(TensorDescribeT *_o, const flatbuffers::res { auto _e = index(); _o->index = _e; }; { auto _e = name(); if (_e) _o->name = _e->str(); }; { auto _e = regions(); if (_e) { _o->regions.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->regions[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } }; + { auto _e = quantInfo(); if (_e) _o->quantInfo = std::unique_ptr(_e->UnPack(_resolver)); }; } inline flatbuffers::Offset TensorDescribe::Pack(flatbuffers::FlatBufferBuilder &_fbb, const TensorDescribeT* _o, const flatbuffers::rehasher_function_t *_rehasher) { @@ -4729,12 +4897,14 @@ inline flatbuffers::Offset CreateTensorDescribe(flatbuffers::Fla auto _index = _o->index; auto _name = _o->name.empty() ? 0 : _fbb.CreateString(_o->name); auto _regions = _o->regions.size() ? _fbb.CreateVector> (_o->regions.size(), [](size_t i, _VectorArgs *__va) { return CreateRegion(*__va->__fbb, __va->__o->regions[i].get(), __va->__rehasher); }, &_va ) : 0; + auto _quantInfo = _o->quantInfo ? CreateTensorQuantInfo(_fbb, _o->quantInfo.get(), _rehasher) : 0; return MNN::CreateTensorDescribe( _fbb, _blob, _index, _name, - _regions); + _regions, + _quantInfo); } inline SubGraphProtoT *SubGraphProto::UnPack(const flatbuffers::resolver_function_t *_resolver) const { @@ -4751,6 +4921,7 @@ inline void SubGraphProto::UnPackTo(SubGraphProtoT *_o, const flatbuffers::resol { auto _e = outputs(); if (_e) { _o->outputs.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->outputs[_i] = _e->Get(_i); } } }; { auto _e = tensors(); if (_e) { _o->tensors.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->tensors[_i] = _e->Get(_i)->str(); } } }; { auto _e = nodes(); if (_e) { _o->nodes.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->nodes[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } }; + { auto _e = extraTensorDescribe(); if (_e) { _o->extraTensorDescribe.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->extraTensorDescribe[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } }; } inline flatbuffers::Offset SubGraphProto::Pack(flatbuffers::FlatBufferBuilder &_fbb, const SubGraphProtoT* _o, const flatbuffers::rehasher_function_t *_rehasher) { @@ -4766,13 +4937,53 @@ inline flatbuffers::Offset CreateSubGraphProto(flatbuffers::FlatB auto _outputs = _o->outputs.size() ? _fbb.CreateVector(_o->outputs) : 0; auto _tensors = _o->tensors.size() ? 
_fbb.CreateVectorOfStrings(_o->tensors) : 0; auto _nodes = _o->nodes.size() ? _fbb.CreateVector> (_o->nodes.size(), [](size_t i, _VectorArgs *__va) { return CreateOp(*__va->__fbb, __va->__o->nodes[i].get(), __va->__rehasher); }, &_va ) : 0; + auto _extraTensorDescribe = _o->extraTensorDescribe.size() ? _fbb.CreateVector> (_o->extraTensorDescribe.size(), [](size_t i, _VectorArgs *__va) { return CreateTensorDescribe(*__va->__fbb, __va->__o->extraTensorDescribe[i].get(), __va->__rehasher); }, &_va ) : 0; return MNN::CreateSubGraphProto( _fbb, _name, _inputs, _outputs, _tensors, - _nodes); + _nodes, + _extraTensorDescribe); +} + +inline TensorQuantInfoT *TensorQuantInfo::UnPack(const flatbuffers::resolver_function_t *_resolver) const { + auto _o = new TensorQuantInfoT(); + UnPackTo(_o, _resolver); + return _o; +} + +inline void TensorQuantInfo::UnPackTo(TensorQuantInfoT *_o, const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = scale(); _o->scale = _e; }; + { auto _e = zero(); _o->zero = _e; }; + { auto _e = min(); _o->min = _e; }; + { auto _e = max(); _o->max = _e; }; + { auto _e = type(); _o->type = _e; }; +} + +inline flatbuffers::Offset TensorQuantInfo::Pack(flatbuffers::FlatBufferBuilder &_fbb, const TensorQuantInfoT* _o, const flatbuffers::rehasher_function_t *_rehasher) { + return CreateTensorQuantInfo(_fbb, _o, _rehasher); +} + +inline flatbuffers::Offset CreateTensorQuantInfo(flatbuffers::FlatBufferBuilder &_fbb, const TensorQuantInfoT *_o, const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const TensorQuantInfoT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _scale = _o->scale; + auto _zero = _o->zero; + auto _min = _o->min; + auto _max = _o->max; + auto _type = _o->type; + return MNN::CreateTensorQuantInfo( + _fbb, + _scale, + _zero, + _min, + _max, + _type); } inline NetT *Net::UnPack(const flatbuffers::resolver_function_t *_resolver) const { @@ -5196,6 +5407,10 @@ inline bool VerifyOpParameter(flatbuffers::Verifier &verifier, const void *obj, auto ptr = reinterpret_cast(obj); return verifier.VerifyTable(ptr); } + case OpParameter_GridSample: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } default: return false; } } @@ -5574,6 +5789,10 @@ inline void *OpParameterUnion::UnPack(const void *obj, OpParameter type, const f auto ptr = reinterpret_cast(obj); return ptr->UnPack(resolver); } + case OpParameter_GridSample: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } default: return nullptr; } } @@ -5940,6 +6159,10 @@ inline flatbuffers::Offset OpParameterUnion::Pack(flatbuffers::FlatBufferB auto ptr = reinterpret_cast(value); return CreateLSTMBlockCell(_fbb, ptr, _rehasher).Union(); } + case OpParameter_GridSample: { + auto ptr = reinterpret_cast(value); + return CreateGridSample(_fbb, ptr, _rehasher).Union(); + } default: return 0; } } @@ -6306,6 +6529,10 @@ inline OpParameterUnion::OpParameterUnion(const OpParameterUnion &u) FLATBUFFERS value = new LSTMBlockCellT(*reinterpret_cast(u.value)); break; } + case OpParameter_GridSample: { + value = new GridSampleT(*reinterpret_cast(u.value)); + break; + } default: break; } @@ -6763,6 +6990,11 @@ inline void OpParameterUnion::Reset() { delete ptr; break; } + case OpParameter_GridSample: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } default: break; } value = 
nullptr; @@ -6929,12 +7161,14 @@ inline const flatbuffers::TypeTable *OpTypeTypeTable() { { flatbuffers::ET_INT, 0, 0 }, { flatbuffers::ET_INT, 0, 0 }, { flatbuffers::ET_INT, 0, 0 }, + { flatbuffers::ET_INT, 0, 0 }, + { flatbuffers::ET_INT, 0, 0 }, { flatbuffers::ET_INT, 0, 0 } }; static const flatbuffers::TypeFunction type_refs[] = { OpTypeTypeTable }; - static const int64_t values[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 512, 513, 514, 515, 516, 517, 518, 600, 601, 603 }; + static const int64_t values[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 512, 513, 514, 515, 516, 517, 518, 600, 601, 603, 604 }; static const char * const names[] = { "AbsVal", "QuantizedAdd", @@ -7072,6 +7306,7 @@ inline const flatbuffers::TypeTable *OpTypeTypeTable() { "TensorArraySplit", "TensorArrayConcat", "LSTMBlockCell", + "Reverse", "Plugin", "Select", "ZerosLike", @@ -7094,10 +7329,11 @@ inline const flatbuffers::TypeTable *OpTypeTypeTable() { "EltwiseInt8", "While", "If", - "LayerNorm" + "LayerNorm", + "GridSample" }; static const flatbuffers::TypeTable tt = { - flatbuffers::ST_ENUM, 159, type_codes, type_refs, values, names + flatbuffers::ST_ENUM, 161, type_codes, type_refs, values, names }; return &tt; } @@ -7194,7 +7430,8 @@ inline const flatbuffers::TypeTable *OpParameterTypeTable() { { flatbuffers::ET_SEQUENCE, 0, 86 }, { flatbuffers::ET_SEQUENCE, 0, 87 }, { flatbuffers::ET_SEQUENCE, 0, 88 }, - { flatbuffers::ET_SEQUENCE, 0, 89 } + { flatbuffers::ET_SEQUENCE, 0, 89 }, + { flatbuffers::ET_SEQUENCE, 0, 90 } }; static const flatbuffers::TypeFunction type_refs[] = { QuantizedAddTypeTable, @@ -7286,7 +7523,8 @@ inline const flatbuffers::TypeTable *OpParameterTypeTable() { RandomUniformTypeTable, LayerNormTypeTable, TensorArrayTypeTable, - LSTMBlockCellTypeTable + LSTMBlockCellTypeTable, + GridSampleTypeTable }; static const char * const names[] = { "NONE", @@ -7379,10 +7617,11 @@ inline const flatbuffers::TypeTable *OpParameterTypeTable() { "RandomUniform", "LayerNorm", "TensorArray", - "LSTMBlockCell" + "LSTMBlockCell", + "GridSample" }; static const flatbuffers::TypeTable tt = { - flatbuffers::ST_UNION, 91, type_codes, type_refs, nullptr, names + flatbuffers::ST_UNION, 92, type_codes, type_refs, nullptr, names }; return &tt; } @@ -7602,20 +7841,23 @@ inline const flatbuffers::TypeTable *TensorDescribeTypeTable() { { 
flatbuffers::ET_SEQUENCE, 0, 0 }, { flatbuffers::ET_INT, 0, -1 }, { flatbuffers::ET_STRING, 0, -1 }, - { flatbuffers::ET_SEQUENCE, 1, 1 } + { flatbuffers::ET_SEQUENCE, 1, 1 }, + { flatbuffers::ET_SEQUENCE, 0, 2 } }; static const flatbuffers::TypeFunction type_refs[] = { BlobTypeTable, - RegionTypeTable + RegionTypeTable, + TensorQuantInfoTypeTable }; static const char * const names[] = { "blob", "index", "name", - "regions" + "regions", + "quantInfo" }; static const flatbuffers::TypeTable tt = { - flatbuffers::ST_TABLE, 4, type_codes, type_refs, nullptr, names + flatbuffers::ST_TABLE, 5, type_codes, type_refs, nullptr, names }; return &tt; } @@ -7626,17 +7868,44 @@ inline const flatbuffers::TypeTable *SubGraphProtoTypeTable() { { flatbuffers::ET_INT, 1, -1 }, { flatbuffers::ET_INT, 1, -1 }, { flatbuffers::ET_STRING, 1, -1 }, - { flatbuffers::ET_SEQUENCE, 1, 0 } + { flatbuffers::ET_SEQUENCE, 1, 0 }, + { flatbuffers::ET_SEQUENCE, 1, 1 } }; static const flatbuffers::TypeFunction type_refs[] = { - OpTypeTable + OpTypeTable, + TensorDescribeTypeTable }; static const char * const names[] = { "name", "inputs", "outputs", "tensors", - "nodes" + "nodes", + "extraTensorDescribe" + }; + static const flatbuffers::TypeTable tt = { + flatbuffers::ST_TABLE, 6, type_codes, type_refs, nullptr, names + }; + return &tt; +} + +inline const flatbuffers::TypeTable *TensorQuantInfoTypeTable() { + static const flatbuffers::TypeCode type_codes[] = { + { flatbuffers::ET_FLOAT, 0, -1 }, + { flatbuffers::ET_FLOAT, 0, -1 }, + { flatbuffers::ET_FLOAT, 0, -1 }, + { flatbuffers::ET_FLOAT, 0, -1 }, + { flatbuffers::ET_INT, 0, 0 } + }; + static const flatbuffers::TypeFunction type_refs[] = { + DataTypeTypeTable + }; + static const char * const names[] = { + "scale", + "zero", + "min", + "max", + "type" }; static const flatbuffers::TypeTable tt = { flatbuffers::ST_TABLE, 5, type_codes, type_refs, nullptr, names diff --git a/schema/current/TensorflowOp_generated.h b/schema/current/TensorflowOp_generated.h index 847964fb..08cb1995 100644 --- a/schema/current/TensorflowOp_generated.h +++ b/schema/current/TensorflowOp_generated.h @@ -374,11 +374,12 @@ enum UnaryOpOperation { UnaryOpOperation_EXPM1 = 28, UnaryOpOperation_SIGMOID = 29, UnaryOpOperation_TANH = 30, + UnaryOpOperation_HARDSWISH = 31, UnaryOpOperation_MIN = UnaryOpOperation_ABS, - UnaryOpOperation_MAX = UnaryOpOperation_TANH + UnaryOpOperation_MAX = UnaryOpOperation_HARDSWISH }; -inline const UnaryOpOperation (&EnumValuesUnaryOpOperation())[31] { +inline const UnaryOpOperation (&EnumValuesUnaryOpOperation())[32] { static const UnaryOpOperation values[] = { UnaryOpOperation_ABS, UnaryOpOperation_NEG, @@ -410,7 +411,8 @@ inline const UnaryOpOperation (&EnumValuesUnaryOpOperation())[31] { UnaryOpOperation_ERFINV, UnaryOpOperation_EXPM1, UnaryOpOperation_SIGMOID, - UnaryOpOperation_TANH + UnaryOpOperation_TANH, + UnaryOpOperation_HARDSWISH }; return values; } @@ -448,13 +450,14 @@ inline const char * const *EnumNamesUnaryOpOperation() { "EXPM1", "SIGMOID", "TANH", + "HARDSWISH", nullptr }; return names; } inline const char *EnumNameUnaryOpOperation(UnaryOpOperation e) { - if (e < UnaryOpOperation_ABS || e > UnaryOpOperation_TANH) return ""; + if (e < UnaryOpOperation_ABS || e > UnaryOpOperation_HARDSWISH) return ""; const size_t index = static_cast(e); return EnumNamesUnaryOpOperation()[index]; } @@ -4981,6 +4984,7 @@ inline const flatbuffers::TypeTable *UnaryOpOperationTypeTable() { { flatbuffers::ET_INT, 0, 0 }, { flatbuffers::ET_INT, 0, 0 }, { flatbuffers::ET_INT, 0, 
0 }, + { flatbuffers::ET_INT, 0, 0 }, { flatbuffers::ET_INT, 0, 0 } }; static const flatbuffers::TypeFunction type_refs[] = { @@ -5017,10 +5021,11 @@ inline const flatbuffers::TypeTable *UnaryOpOperationTypeTable() { "ERFINV", "EXPM1", "SIGMOID", - "TANH" + "TANH", + "HARDSWISH" }; static const flatbuffers::TypeTable tt = { - flatbuffers::ST_ENUM, 31, type_codes, type_refs, nullptr, names + flatbuffers::ST_ENUM, 32, type_codes, type_refs, nullptr, names }; return &tt; } diff --git a/schema/current/UserDefine_generated.h b/schema/current/UserDefine_generated.h index 7935cf21..b32f5ad4 100644 --- a/schema/current/UserDefine_generated.h +++ b/schema/current/UserDefine_generated.h @@ -13,8 +13,76 @@ namespace MNN { struct TensorConvertInfo; struct TensorConvertInfoT; +struct GridSample; +struct GridSampleT; + inline const flatbuffers::TypeTable *TensorConvertInfoTypeTable(); +inline const flatbuffers::TypeTable *GridSampleTypeTable(); + +enum SampleMode { + SampleMode_BILINEAR = 0, + SampleMode_NEAREST = 1, + SampleMode_MIN = SampleMode_BILINEAR, + SampleMode_MAX = SampleMode_NEAREST +}; + +inline const SampleMode (&EnumValuesSampleMode())[2] { + static const SampleMode values[] = { + SampleMode_BILINEAR, + SampleMode_NEAREST + }; + return values; +} + +inline const char * const *EnumNamesSampleMode() { + static const char * const names[] = { + "BILINEAR", + "NEAREST", + nullptr + }; + return names; +} + +inline const char *EnumNameSampleMode(SampleMode e) { + if (e < SampleMode_BILINEAR || e > SampleMode_NEAREST) return ""; + const size_t index = static_cast(e); + return EnumNamesSampleMode()[index]; +} + +enum BorderMode { + BorderMode_ZEROS = 0, + BorderMode_CLAMP = 1, + BorderMode_REFLECTION = 2, + BorderMode_MIN = BorderMode_ZEROS, + BorderMode_MAX = BorderMode_REFLECTION +}; + +inline const BorderMode (&EnumValuesBorderMode())[3] { + static const BorderMode values[] = { + BorderMode_ZEROS, + BorderMode_CLAMP, + BorderMode_REFLECTION + }; + return values; +} + +inline const char * const *EnumNamesBorderMode() { + static const char * const names[] = { + "ZEROS", + "CLAMP", + "REFLECTION", + nullptr + }; + return names; +} + +inline const char *EnumNameBorderMode(BorderMode e) { + if (e < BorderMode_ZEROS || e > BorderMode_REFLECTION) return ""; + const size_t index = static_cast(e); + return EnumNamesBorderMode()[index]; +} + struct TensorConvertInfoT : public flatbuffers::NativeTable { typedef TensorConvertInfo TableType; MNN_DATA_FORMAT source; @@ -84,6 +152,87 @@ inline flatbuffers::Offset CreateTensorConvertInfo( flatbuffers::Offset CreateTensorConvertInfo(flatbuffers::FlatBufferBuilder &_fbb, const TensorConvertInfoT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +struct GridSampleT : public flatbuffers::NativeTable { + typedef GridSample TableType; + SampleMode mode; + BorderMode paddingMode; + bool alignCorners; + GridSampleT() + : mode(SampleMode_BILINEAR), + paddingMode(BorderMode_ZEROS), + alignCorners(false) { + } +}; + +struct GridSample FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef GridSampleT NativeTableType; + static const flatbuffers::TypeTable *MiniReflectTypeTable() { + return GridSampleTypeTable(); + } + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_MODE = 4, + VT_PADDINGMODE = 6, + VT_ALIGNCORNERS = 8 + }; + SampleMode mode() const { + return static_cast(GetField(VT_MODE, 0)); + } + BorderMode paddingMode() const { + return static_cast(GetField(VT_PADDINGMODE, 0)); + } + bool alignCorners() const { + return 
GetField(VT_ALIGNCORNERS, 0) != 0; + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_MODE) && + VerifyField(verifier, VT_PADDINGMODE) && + VerifyField(verifier, VT_ALIGNCORNERS) && + verifier.EndTable(); + } + GridSampleT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(GridSampleT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; + static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const GridSampleT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct GridSampleBuilder { + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_mode(SampleMode mode) { + fbb_.AddElement(GridSample::VT_MODE, static_cast(mode), 0); + } + void add_paddingMode(BorderMode paddingMode) { + fbb_.AddElement(GridSample::VT_PADDINGMODE, static_cast(paddingMode), 0); + } + void add_alignCorners(bool alignCorners) { + fbb_.AddElement(GridSample::VT_ALIGNCORNERS, static_cast(alignCorners), 0); + } + explicit GridSampleBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + GridSampleBuilder &operator=(const GridSampleBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateGridSample( + flatbuffers::FlatBufferBuilder &_fbb, + SampleMode mode = SampleMode_BILINEAR, + BorderMode paddingMode = BorderMode_ZEROS, + bool alignCorners = false) { + GridSampleBuilder builder_(_fbb); + builder_.add_alignCorners(alignCorners); + builder_.add_paddingMode(paddingMode); + builder_.add_mode(mode); + return builder_.Finish(); +} + +flatbuffers::Offset CreateGridSample(flatbuffers::FlatBufferBuilder &_fbb, const GridSampleT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); + inline TensorConvertInfoT *TensorConvertInfo::UnPack(const flatbuffers::resolver_function_t *_resolver) const { auto _o = new TensorConvertInfoT(); UnPackTo(_o, _resolver); @@ -113,6 +262,76 @@ inline flatbuffers::Offset CreateTensorConvertInfo(flatbuffer _dest); } +inline GridSampleT *GridSample::UnPack(const flatbuffers::resolver_function_t *_resolver) const { + auto _o = new GridSampleT(); + UnPackTo(_o, _resolver); + return _o; +} + +inline void GridSample::UnPackTo(GridSampleT *_o, const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = mode(); _o->mode = _e; }; + { auto _e = paddingMode(); _o->paddingMode = _e; }; + { auto _e = alignCorners(); _o->alignCorners = _e; }; +} + +inline flatbuffers::Offset GridSample::Pack(flatbuffers::FlatBufferBuilder &_fbb, const GridSampleT* _o, const flatbuffers::rehasher_function_t *_rehasher) { + return CreateGridSample(_fbb, _o, _rehasher); +} + +inline flatbuffers::Offset CreateGridSample(flatbuffers::FlatBufferBuilder &_fbb, const GridSampleT *_o, const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const GridSampleT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _mode = _o->mode; + auto _paddingMode = _o->paddingMode; + auto _alignCorners = _o->alignCorners; + return MNN::CreateGridSample( + _fbb, + _mode, + _paddingMode, + _alignCorners); +} + +inline const flatbuffers::TypeTable *SampleModeTypeTable() { + static const 
flatbuffers::TypeCode type_codes[] = { + { flatbuffers::ET_CHAR, 0, 0 }, + { flatbuffers::ET_CHAR, 0, 0 } + }; + static const flatbuffers::TypeFunction type_refs[] = { + SampleModeTypeTable + }; + static const char * const names[] = { + "BILINEAR", + "NEAREST" + }; + static const flatbuffers::TypeTable tt = { + flatbuffers::ST_ENUM, 2, type_codes, type_refs, nullptr, names + }; + return &tt; +} + +inline const flatbuffers::TypeTable *BorderModeTypeTable() { + static const flatbuffers::TypeCode type_codes[] = { + { flatbuffers::ET_CHAR, 0, 0 }, + { flatbuffers::ET_CHAR, 0, 0 }, + { flatbuffers::ET_CHAR, 0, 0 } + }; + static const flatbuffers::TypeFunction type_refs[] = { + BorderModeTypeTable + }; + static const char * const names[] = { + "ZEROS", + "CLAMP", + "REFLECTION" + }; + static const flatbuffers::TypeTable tt = { + flatbuffers::ST_ENUM, 3, type_codes, type_refs, nullptr, names + }; + return &tt; +} + inline const flatbuffers::TypeTable *TensorConvertInfoTypeTable() { static const flatbuffers::TypeCode type_codes[] = { { flatbuffers::ET_CHAR, 0, 0 }, @@ -131,6 +350,27 @@ inline const flatbuffers::TypeTable *TensorConvertInfoTypeTable() { return &tt; } +inline const flatbuffers::TypeTable *GridSampleTypeTable() { + static const flatbuffers::TypeCode type_codes[] = { + { flatbuffers::ET_CHAR, 0, 0 }, + { flatbuffers::ET_CHAR, 0, 1 }, + { flatbuffers::ET_BOOL, 0, -1 } + }; + static const flatbuffers::TypeFunction type_refs[] = { + SampleModeTypeTable, + BorderModeTypeTable + }; + static const char * const names[] = { + "mode", + "paddingMode", + "alignCorners" + }; + static const flatbuffers::TypeTable tt = { + flatbuffers::ST_TABLE, 3, type_codes, type_refs, nullptr, names + }; + return &tt; +} + } // namespace MNN #endif // FLATBUFFERS_GENERATED_USERDEFINE_MNN_H_ diff --git a/schema/default/MNN.fbs b/schema/default/MNN.fbs index d057b3d4..d0f0907d 100644 --- a/schema/default/MNN.fbs +++ b/schema/default/MNN.fbs @@ -153,6 +153,7 @@ enum OpType : int { TensorArraySplit = 139, TensorArrayConcat = 140, LSTMBlockCell = 141, + Reverse = 142, Plugin = 256, //The Type load from plugin //Training Op Start from 257 @@ -183,6 +184,7 @@ enum OpType : int { While = 600, If = 601, LayerNorm = 603, + GridSample = 604, } table Plugin { @@ -328,6 +330,7 @@ union OpParameter { LayerNorm, TensorArray, LSTMBlockCell, + GridSample, } table Op { @@ -356,6 +359,7 @@ table TensorDescribe { index: int; name: string; regions:[Region]; + quantInfo:TensorQuantInfo; } enum ForwardType : byte { @@ -387,6 +391,17 @@ table SubGraphProto { // Nodes of the subgraph. 
nodes: [Op]; + + // Tensor describe info + extraTensorDescribe:[TensorDescribe]; +} + +table TensorQuantInfo { + scale:float; + zero:float = 0; + min:float = -128; + max:float = 127; + type:DataType; } table Net { diff --git a/schema/default/TensorflowOp.fbs b/schema/default/TensorflowOp.fbs index 483bf7c1..d8f387a8 100644 --- a/schema/default/TensorflowOp.fbs +++ b/schema/default/TensorflowOp.fbs @@ -139,6 +139,7 @@ enum UnaryOpOperation : int { EXPM1 = 28, SIGMOID = 29, TANH = 30, + HARDSWISH = 31, } table UnaryOp { diff --git a/schema/default/UserDefine.fbs b/schema/default/UserDefine.fbs index 5a465697..508108b1 100644 --- a/schema/default/UserDefine.fbs +++ b/schema/default/UserDefine.fbs @@ -4,3 +4,19 @@ table TensorConvertInfo { source:MNN_DATA_FORMAT; dest:MNN_DATA_FORMAT; } + +enum SampleMode : byte { + BILINEAR=0, + NEAREST +} +enum BorderMode : byte { + ZEROS=0, + CLAMP, + REFLECTION +} + +table GridSample { + mode:SampleMode; + paddingMode:BorderMode; + alignCorners:bool=false; +} diff --git a/source/backend/arm82/Arm82Backend.cpp b/source/backend/arm82/Arm82Backend.cpp index 0c0622b2..06e85a12 100644 --- a/source/backend/arm82/Arm82Backend.cpp +++ b/source/backend/arm82/Arm82Backend.cpp @@ -5,17 +5,18 @@ // Created by MNN on 2019/01/31. // Copyright © 2018, Alibaba Group Holding Limited // - -#ifdef __aarch64__ +#if defined(__ANDROID__) || defined(__aarch64__) #include #include -#include "backend/arm82/Arm82Backend.hpp" -#include "backend/arm82/Arm82OptFunc.hpp" +#include "Arm82Backend.hpp" +#include "Arm82OptFunc.hpp" +#include "Arm82Functions.hpp" #include "core/BufferAllocator.hpp" #include "core/TensorUtils.hpp" - +#include "core/OpCommonUtils.hpp" +#include "backend/cpu/compute/CommonOptFunction.h" #include "half.hpp" namespace MNN { @@ -37,8 +38,8 @@ bool Arm82Backend::addArm82Creator(OpType t, Arm82Creator* ct) { return true; } -Arm82Backend::Arm82Backend(const CPURuntime* runtime) : CPUBackend(runtime, MNN_FORWARD_CPU_EXTENSION) { - // nothing to do +Arm82Backend::Arm82Backend(const CPURuntime* runtime) : CPUBackend(runtime, BackendConfig::Precision_Low, MNN_FORWARD_CPU_EXTENSION) { + mCoreFunctions = Arm82Functions::get(); } Arm82Backend::~Arm82Backend() { @@ -52,6 +53,14 @@ Execution* Arm82Backend::onCreate(const std::vector& inputs, const std: return nullptr; } } + auto quantInfo = OpCommonUtils::getQuantInfo(inputs); + if (quantInfo.first) { + return nullptr; + } + bool originCreate = OpCommonUtils::opCompabilityForLowp(op); + if (originCreate) { + return CPUBackend::onCreate(inputs, outputs, op); + } auto creatorContainer = getArm82CreatorContainer(); // MNN_PRINT("====> create Execution for type: %s\n", MNN::EnumNameOpType(op->type())); auto iter = creatorContainer->find(op->type()); @@ -88,7 +97,7 @@ bool Arm82Backend::onAcquireBuffer(const Tensor* nativeTensor, StorageType stora // arm82 backend tensor data type is fp16 default auto tensor = const_cast(nativeTensor); auto& buffer = tensor->buffer(); - if (buffer.type != halide_type_of()) { + if (buffer.type != halide_type_of() && buffer.type != halide_type_of()) { return CPUBackend::onAcquireBuffer(nativeTensor, storageType); } auto res = allocBuffer(_getAliginSize(buffer, TensorUtils::getDescribe(nativeTensor)->dimensionFormat), (Tensor*)nativeTensor, storageType); @@ -128,7 +137,7 @@ static void _convertFp16Inside(const halide_buffer_t& ib, const halide_buffer_t& const int outBatchStide = channel * area; for (int i = 0; i < batch; ++i) { - MNNNC8HW8TONCHW_NO_TYPE((uint16_t*)ob.host + outBatchStide * i, (const 
uint16_t*)ib.host + inbatchStride * i, area, + MNNUnPackC8FP16((FLOAT16*)ob.host + outBatchStide * i, (const FLOAT16*)ib.host + inbatchStride * i, area, channel); } return; @@ -138,7 +147,7 @@ static void _convertFp16Inside(const halide_buffer_t& ib, const halide_buffer_t& const int inbatchStride = channel * area; const int outBatchStide = UP_DIV(channel, ARMV82_CHANNEL_UNIT) * area * ARMV82_CHANNEL_UNIT; for (int i = 0; i < batch; ++i) { - MNNNCHWTONC8HW8_NO_TYPE((uint16_t*)ob.host + outBatchStide * i, (const uint16_t*)ib.host + inbatchStride * i, area, + MNNPackC8FP16((FLOAT16*)ob.host + outBatchStide * i, (const FLOAT16*)ib.host + inbatchStride * i, area, channel); } return; @@ -200,14 +209,14 @@ void Arm82Backend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor const int outBatchStride = UP_DIV(channel, ARMV82_CHANNEL_UNIT) * area * ARMV82_CHANNEL_UNIT; const int inbatchStride = UP_DIV(channel, 4) * area * 4; for (int i = 0; i < batch; ++i) { - MNNNC4HW4TONC8HW8(dstTensor->host() + outBatchStride * i, srcTensor->host() + inbatchStride * i, area, + MNNNC4HW4TONC8HW8(dstTensor->host() + outBatchStride * i, srcTensor->host() + inbatchStride * i, area, channel); } } else { const int inbatchStride = UP_DIV(channel, ARMV82_CHANNEL_UNIT) * area * ARMV82_CHANNEL_UNIT; const int outBatchStide = UP_DIV(channel, 4) * area * 4; for (int i = 0; i < batch; ++i) { - MNNNC8HW8TONC4HW4(dstTensor->host() + outBatchStide * i, srcTensor->host() + inbatchStride * i, area, + MNNNC8HW8TONC4HW4(dstTensor->host() + outBatchStide * i, srcTensor->host() + inbatchStride * i, area, channel); } } @@ -220,15 +229,15 @@ void Arm82Backend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor // cpu -> arm82 copy if (srcType == MNN_FORWARD_CPU) { const auto src = srcTensor->host(); - auto dst = dstTensor->host(); - MNNQuantizeFP16(dst, src, elemenSize); + auto dst = dstTensor->host(); + MNNQuantizeFP16(src, dst, elemenSize); return; } // arm82 -> cpu copy if (srcType == MNN_FORWARD_CPU_EXTENSION) { const auto src = srcTensor->host(); auto dst = dstTensor->host(); - MNNDequantizeFP16(dst, src, elemenSize); + MNNDequantizeFP16(src, dst, elemenSize); return; } MNN_ERROR("Invalide copy for intenal Arm82 Backend\n"); @@ -236,6 +245,7 @@ void Arm82Backend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor } void registerArm82RuntimeCreator() { + Arm82Functions::init(); registerArm82Ops(); }; #ifndef MNN_CODEGEN_REGISTER @@ -246,5 +256,4 @@ static const auto __arm82_global_initializer = []() { #endif } // namespace MNN - #endif diff --git a/source/backend/arm82/Arm82Backend.hpp b/source/backend/arm82/Arm82Backend.hpp index 049ab61c..0dd084e2 100644 --- a/source/backend/arm82/Arm82Backend.hpp +++ b/source/backend/arm82/Arm82Backend.hpp @@ -5,19 +5,25 @@ // Created by MNN on 2019/01/31. 
// Copyright © 2018, Alibaba Group Holding Limited // -#ifdef __aarch64__ +#if defined(__ANDROID__) || defined(__aarch64__) + #ifndef Arm82Backend_hpp #define Arm82Backend_hpp #include "backend/cpu/CPUBackend.hpp" #include "core/Macro.h" #include "core/TensorUtils.hpp" +#include // armv82's data type default is fp16, so set // armv82's dataformat: NC8HW8 #define ARMV82_CHANNEL_UNIT 8 typedef __fp16 FLOAT16; +template<> +HALIDE_ALWAYS_INLINE halide_type_t halide_type_of() { + return halide_type_t(halide_type_float, 16); +} namespace MNN { class Arm82Backend : public CPUBackend { @@ -60,8 +66,19 @@ inline int ARM82TensorElementSizeHelper(const Tensor* t) { return size; } +inline int ARM82TensorStrideHelper(const Tensor* t, int dim) { + int size = 1; + for (int i = t->dimensions() - 1; i > dim; i--) { + int currentDimSize = t->length(i); + if (TensorUtils::getDescribe(t)->dimensionFormat == MNN_DATA_FORMAT_NC4HW4 && 1 == i) { + currentDimSize = UP_DIV(currentDimSize, 8) * 8; + } + size *= currentDimSize; + } + return size; +} + } // namespace MNN #endif /* Arm82Backend_hpp */ - #endif diff --git a/source/backend/arm82/Arm82Binary.cpp b/source/backend/arm82/Arm82Binary.cpp index 4015ec55..bf5b19ad 100644 --- a/source/backend/arm82/Arm82Binary.cpp +++ b/source/backend/arm82/Arm82Binary.cpp @@ -6,7 +6,7 @@ // Copyright © 2021, Alibaba Group Holding Limited // -#ifdef __aarch64__ +#if defined(__ANDROID__) || defined(__aarch64__) #include #include "backend/arm82/Arm82Binary.hpp" #include "backend/arm82/Arm82Backend.hpp" diff --git a/source/backend/arm82/Arm82Binary.hpp b/source/backend/arm82/Arm82Binary.hpp index 4f69d960..50a23018 100644 --- a/source/backend/arm82/Arm82Binary.hpp +++ b/source/backend/arm82/Arm82Binary.hpp @@ -5,7 +5,8 @@ // Created by MNN on 2021/01/05. // Copyright © 2021, Alibaba Group Holding Limited // -#ifdef __aarch64__ +#if defined(__ANDROID__) || defined(__aarch64__) + #ifndef Arm82Binary_hpp #define Arm82Binary_hpp diff --git a/source/backend/arm82/Arm82Convolution.cpp b/source/backend/arm82/Arm82Convolution.cpp deleted file mode 100644 index d1f9455b..00000000 --- a/source/backend/arm82/Arm82Convolution.cpp +++ /dev/null @@ -1,471 +0,0 @@ -// -// Arm82Convolution.cpp -// MNN -// -// Created by MNN on 2020/01/07. 
-// Copyright © 2018, Alibaba Group Holding Limited -// -#ifdef __aarch64__ -#include "backend/arm82/Arm82Convolution.hpp" -#include "backend/arm82/Arm82Backend.hpp" -#include "backend/arm82/Arm82Convolution3x3.hpp" -#include "backend/arm82/Arm82OptFunc.hpp" -#include "core/Concurrency.h" -#include "core/Macro.h" -#include "core/TensorUtils.hpp" -#include "core/ConvolutionCommon.hpp" - -#ifdef MNN_USE_NEON -#include -#endif - -namespace MNN { - -#ifndef MNN_USE_NEON -static void MNNGemmFP16C8_UNIT(FLOAT16 *dst, const FLOAT16 *src, const FLOAT16 *weight, const FLOAT16 *bias, - size_t src_loop, size_t dst_step, size_t dst_loop, size_t relu, size_t relu6, - size_t realDstCount) { - const auto dst_step_tmp = dst_step / sizeof(FLOAT16); - - for (int dz = 0; dz < dst_loop; ++dz) { - const auto weight_dz = weight + dz * src_loop * (ARMV82_CHANNEL_UNIT * ARMV82_CHANNEL_UNIT); - const auto bias_dz = bias + dz * ARMV82_CHANNEL_UNIT; - auto dst_z = dst + dz * dst_step_tmp; - for (int w = 0; w < DST_XUNIT; ++w) { - const auto src_x = src + w * ARMV82_CHANNEL_UNIT; - auto dst_x = dst_z + w * ARMV82_CHANNEL_UNIT; - FLOAT16 dstTemp[ARMV82_CHANNEL_UNIT]; - - memcpy(dstTemp, bias_dz, sizeof(FLOAT16) * ARMV82_CHANNEL_UNIT); - - // MAC - for (int sz = 0; sz < src_loop; ++sz) { - const auto weight_sz = weight_dz + (ARMV82_CHANNEL_UNIT * ARMV82_CHANNEL_UNIT) * sz; - const auto src_z = src_x + sz * DST_XUNIT * ARMV82_CHANNEL_UNIT; - - for (int j = 0; j < ARMV82_CHANNEL_UNIT; ++j) { - for (int i = 0; i < ARMV82_CHANNEL_UNIT; ++i) { - dstTemp[j] += src_z[i] * weight_sz[i * ARMV82_CHANNEL_UNIT + j]; - } - } - } // end MAC - - if (relu) { - for (int j = 0; j < ARMV82_CHANNEL_UNIT; ++j) { - if (dstTemp[j] < 0) { - dstTemp[j] = 0; - } - } - } - if (relu6) { - for (int j = 0; j < ARMV82_CHANNEL_UNIT; ++j) { - if (dstTemp[j] < 0) { - dstTemp[j] = 0; - } - if (dstTemp[j] > 6) { - dstTemp[j] = 6.0; - } - } - } - - memcpy(dst_x, dstTemp, sizeof(FLOAT16) * ARMV82_CHANNEL_UNIT); - } - } -} -#endif - -static void Im2ColTransformer(FLOAT16 *dst, const FLOAT16 *src, ConvolutionCommon::Im2ColParameter *im2colParam, - size_t xIndexStart, size_t realDstCount) { - { - const int colBufferSize = im2colParam->kernelCountUnit * DST_XUNIT * ARMV82_CHANNEL_UNIT * sizeof(FLOAT16); - memset(dst, 0, colBufferSize); - } - // src data format is nc8hw8 - - const auto ih = im2colParam->ih; - const auto iw = im2colParam->iw; - // const auto oh = im2colParameter->oh; - const auto ow = im2colParam->ow; - const auto kh = im2colParam->kernelY; - const auto kw = im2colParam->kernelX; - const auto dilateX = im2colParam->dilateX; - const auto dilateY = im2colParam->dilateY; - const auto icDiv4 = im2colParam->icDiv4; - const auto srcChannleStride = iw * ih * ARMV82_CHANNEL_UNIT; - const auto stridex = im2colParam->strideX; - const auto stridey = im2colParam->strideY; - const auto padx = im2colParam->padX; - const auto pady = im2colParam->padY; - constexpr int dstXStep = ARMV82_CHANNEL_UNIT * DST_XUNIT; - - for (int i = 0; i < realDstCount; ++i) { - int xIndex = (int)xIndexStart + i; - int ox = xIndex % ow; - int oy = xIndex / ow; - int sx = ox * stridex - padx; - int sy = oy * stridey - pady; - int sfy = ALIMAX(0, (UP_DIV(-sy, dilateY))); - int efy = ALIMIN(kh, UP_DIV(ih - sy, dilateY)); - int sfx = ALIMAX(0, (UP_DIV(-sx, dilateX))); - int efx = ALIMIN(kw, UP_DIV(iw - sx, dilateX)); - int fyC = efy - sfy; - int fxC = efx - sfx; - - auto colAddrI = dst + ARMV82_CHANNEL_UNIT * i; - auto inputOffset = src + (sx + sfx * dilateX + (sy + sfy * dilateY) * iw) 
* ARMV82_CHANNEL_UNIT; - auto indexOffset = (sfy * kw + sfx) * icDiv4; - - for (int fy = 0; fy < fyC; ++fy) { - for (int fx = 0; fx < fxC; ++fx) { - auto inputUnit = inputOffset + (fx * dilateX + fy * dilateY * iw) * ARMV82_CHANNEL_UNIT; - auto indexStart = (indexOffset + (fy * kw + fx) * icDiv4) * dstXStep; - for (int sz = 0; sz < icDiv4; ++sz) { - auto dstUnit = colAddrI + indexStart + sz * dstXStep; - memcpy(dstUnit, inputUnit, ARMV82_CHANNEL_UNIT * sizeof(FLOAT16)); - inputUnit += srcChannleStride; - } - } - } - } - - // shuffle channel -#ifdef MNN_USE_NEON - if (realDstCount > (DST_XUNIT / 2)) { - MNNShuffleChannelC8(dst, dst, (size_t)im2colParam->kernelCountUnit, 0); - } else { - MNNShuffleChannelC8(dst, dst, (size_t)im2colParam->kernelCountUnit, 1); - } -#endif -} - -static void Im2ColTransformer1x1(FLOAT16 *dst, const FLOAT16 *src, ConvolutionCommon::Im2ColParameter *im2colParam, - size_t xIndexStart, size_t realDstCount) { - { - const int colBufferSize = im2colParam->kernelCountUnit * DST_XUNIT * ARMV82_CHANNEL_UNIT * sizeof(FLOAT16); - memset(dst, 0, colBufferSize); - } - // src data format is nc8hw8 - const auto ih = im2colParam->ih; - const auto iw = im2colParam->iw; - - const auto icDiv8 = im2colParam->icDiv4; - const auto srcChannleStride = iw * ih * ARMV82_CHANNEL_UNIT; - constexpr int dstXStep = ARMV82_CHANNEL_UNIT * DST_XUNIT; - const auto srcStartPtr = src + xIndexStart * ARMV82_CHANNEL_UNIT; - - for (int c = 0; c < icDiv8; ++c) { - memcpy(dst + c * dstXStep, srcStartPtr + c * srcChannleStride, - sizeof(FLOAT16) * ARMV82_CHANNEL_UNIT * realDstCount); - } - -// shuffle channel -#ifdef MNN_USE_NEON - if (realDstCount > (DST_XUNIT / 2)) { - MNNShuffleChannelC8(dst, dst, (size_t)im2colParam->kernelCountUnit, 0); - } else { - MNNShuffleChannelC8(dst, dst, (size_t)im2colParam->kernelCountUnit, 1); - } -#endif -} - -Arm82Convolution::Arm82Convolution(const MNN::Convolution2D *convParam, Backend *bn) : Execution(bn) { - const auto convCommon = convParam->common(); - mCommon = convCommon; - const int kx = convCommon->kernelX(); - const int ky = convCommon->kernelY(); - const int kernelCount = kx * ky; - int inputChannel = convCommon->inputCount(); - const int outputChannel = convCommon->outputCount(); - if (inputChannel == 0) { - if (convParam->quanParameter()) { - inputChannel = convParam->quanParameter()->buffer()->size() / (2 * kernelCount * outputChannel); - } else { - inputChannel = convParam->weight()->size() / (kernelCount * outputChannel); - } - } - const int inputChannelUnit = UP_DIV(inputChannel, ARMV82_CHANNEL_UNIT); - const int outputChannelUnit = UP_DIV(outputChannel, ARMV82_CHANNEL_UNIT); - - const int totalKernelCountUnit = kernelCount * inputChannelUnit; - mWeightFp16.reset(Tensor::createDevice( - {outputChannelUnit, totalKernelCountUnit, ARMV82_CHANNEL_UNIT, ARMV82_CHANNEL_UNIT})); - auto allocRes = bn->onAcquireBuffer(mWeightFp16.get(), Backend::STATIC); - if (!allocRes) { - mValid = false; - return; - } - - auto weightFp16DstPtr = mWeightFp16->host(); - memset(weightFp16DstPtr, 0, mWeightFp16->size()); - - const FLOAT16 *fp16WeightPtr = nullptr; - std::vector weightFp16; - if (convParam->quanParameter()) { - MNN_ASSERT((convParam->quanParameter()->type() == 3) || (convParam->quanParameter()->type() == 4)); - if (convParam->quanParameter()->type() == 3) { - // the data type of weight is fp16 - fp16WeightPtr = reinterpret_cast(convParam->quanParameter()->buffer()->data()); - } - if (convParam->quanParameter()->type() == 4) { - std::shared_ptr quanCommon; - 
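            // quanParameter type 4 holds a compressed weight blob: ConvolutionCommon::load
            // decodes it back to fp32, and the decoded weights are re-quantized to fp16 just
            // below; type 3 (handled above) already stores the weights as fp16 and uses them
            // directly.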
quanCommon = ConvolutionCommon::load(convParam->quanParameter(), true); - int weightCount = convParam->quanParameter()->buffer()->size(); - weightFp16.resize(weightCount); - MNNQuantizeFP16(weightFp16.data(), quanCommon->weightFloat.get(), weightCount); - fp16WeightPtr = weightFp16.data(); - } - } else { - // the data type of weight is fp32, then quantize weight to be fp16 data type - int size = convParam->weight()->size(); - weightFp16.resize(size); - MNNQuantizeFP16(weightFp16.data(), convParam->weight()->data(), size); - fp16WeightPtr = weightFp16.data(); - } - - auto weightFp16SrcPtr = fp16WeightPtr; - - const int oneChannleKernelSize = kernelCount * inputChannel; - -#ifdef MNN_USE_NEON - int curOcChannel = 0; - auto reorderWeight = [&](int ocUnit, int ocUnitNum, const FLOAT16 *weightSrc, FLOAT16 *weightDst) { - for (int oc = 0; oc < ocUnitNum; ++oc) { - auto weightDstOcUnit = weightDst + oc * kernelCount * inputChannelUnit * ARMV82_CHANNEL_UNIT * ocUnit; - const auto weightSrcOc = weightSrc + oc * ocUnit * oneChannleKernelSize; - for (int k = 0; k < kernelCount; ++k) { - auto weightDstK = weightDstOcUnit + k * inputChannelUnit * ARMV82_CHANNEL_UNIT * ocUnit; - const auto weightSrcK = weightSrcOc + k; - for (int y = 0; y < inputChannel; ++y) { - const int yOutSide = y / ARMV82_CHANNEL_UNIT; - const int yInSide = y % ARMV82_CHANNEL_UNIT; - auto weightDstIc = weightDstK + yOutSide * ARMV82_CHANNEL_UNIT * ocUnit + yInSide * ocUnit; - const auto weigthSrcIc = weightSrcK + y * kernelCount; - - for (int x = 0; x < ocUnit; ++x) { - if (curOcChannel + x < outputChannel) { - weightDstIc[x] = weigthSrcIc[x * oneChannleKernelSize]; - } - } - } - } - curOcChannel += ocUnit; - } - }; - const int ocDivDoubleUnit = outputChannelUnit / 2; - // reorder weight in double ARMV82_CHANNEL_UNIT - reorderWeight((ARMV82_CHANNEL_UNIT * 2), ocDivDoubleUnit, weightFp16SrcPtr, weightFp16DstPtr); - auto weightRemainDst = weightFp16DstPtr + kernelCount * inputChannelUnit * ARMV82_CHANNEL_UNIT * ocDivDoubleUnit * - (ARMV82_CHANNEL_UNIT * 2); - auto weightRemainSrc = weightFp16SrcPtr + kernelCount * inputChannel * ocDivDoubleUnit * (ARMV82_CHANNEL_UNIT * 2); - if (outputChannelUnit % 2 == 1) { - // reorder weight in ARMV82_CHANNEL_UNIT - reorderWeight(ARMV82_CHANNEL_UNIT, 1, weightRemainSrc, weightRemainDst); - } -#else - // reorder weight - const int ocUnitStride = inputChannelUnit * ARMV82_CHANNEL_UNIT * kernelCount * ARMV82_CHANNEL_UNIT; - for (int k = 0; k < kernelCount; ++k) { - const auto weightSrcK = weightFp16SrcPtr + k; - auto weightDstK = weightFp16DstPtr + k * inputChannelUnit * ARMV82_CHANNEL_UNIT * ARMV82_CHANNEL_UNIT; - for (int y = 0; y < inputChannel; ++y) { - const int yOutSide = y / ARMV82_CHANNEL_UNIT; - const int yInSide = y % ARMV82_CHANNEL_UNIT; - - auto dstY = - weightDstK + yOutSide * ARMV82_CHANNEL_UNIT * ARMV82_CHANNEL_UNIT + yInSide * ARMV82_CHANNEL_UNIT; - const auto srcY = weightSrcK + y * kernelCount; - for (int x = 0; x < outputChannel; ++x) { - const int xOutSide = x / ARMV82_CHANNEL_UNIT; - const int xInSide = x % ARMV82_CHANNEL_UNIT; - const int dstIndex = xOutSide * ocUnitStride + xInSide; - const int srcIndex = x * oneChannleKernelSize; - dstY[dstIndex] = srcY[srcIndex]; - } - } - } -#endif - - mBiasFp16.reset(Tensor::createDevice({outputChannelUnit * ARMV82_CHANNEL_UNIT})); - allocRes = bn->onAcquireBuffer(mBiasFp16.get(), Backend::STATIC); - if (!allocRes) { - mValid = false; - return; - } - - // TODO, bias is fp32, save bias also in fp16? 
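    // The fp32 bias from the model is stored as fp16: the destination buffer below is
    // zero-initialized (padding the channel count up to a multiple of ARMV82_CHANNEL_UNIT)
    // and then filled by MNNQuantizeFP16. Conceptually that call behaves like the scalar
    // sketch below (illustrative only; the real routine is presumably NEON-vectorized):
    //
    //     for (int i = 0; i < outputChannel; ++i) {
    //         biasDstPtr[i] = static_cast<FLOAT16>(convParam->bias()->data()[i]);
    //     }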
- auto biasDstPtr = mBiasFp16->host(); - memset(biasDstPtr, 0, mBiasFp16->size()); - MNNQuantizeFP16(biasDstPtr, convParam->bias()->data(), outputChannel); - - mIm2ColParamter.dilateX = convCommon->dilateX(); - mIm2ColParamter.dilateY = convCommon->dilateY(); - mIm2ColParamter.strideX = convCommon->strideX(); - mIm2ColParamter.strideY = convCommon->strideY(); - mIm2ColParamter.padX = convCommon->padX(); - mIm2ColParamter.padY = convCommon->padY(); - mIm2ColParamter.icDiv4 = inputChannelUnit; - mIm2ColParamter.kernelX = convCommon->kernelX(); - mIm2ColParamter.kernelY = convCommon->kernelY(); - mIm2ColParamter.kernelCountUnit = totalKernelCountUnit; - - mRelu6 = convCommon->relu6(); - mRelu = convCommon->relu(); -} - -Arm82Convolution::~Arm82Convolution() { - if (mWeightFp16 != nullptr) { - backend()->onReleaseBuffer(mWeightFp16.get(), Backend::STATIC); - } - if (mBiasFp16 != nullptr) { - backend()->onReleaseBuffer(mBiasFp16.get(), Backend::STATIC); - } -} - -ErrorCode Arm82Convolution::onResize(const std::vector &inputs, const std::vector &outputs) { - auto input = inputs[0]; - auto output = outputs[0]; - - mIm2ColParamter.padX = mCommon->padX(); - mIm2ColParamter.padY = mCommon->padY(); - if (mCommon->padMode() == PadMode_SAME) { - int kernelWidthSize = (mCommon->kernelX() - 1) * mCommon->dilateX() + 1; - int kernelHeightSize = (mCommon->kernelY() - 1) * mCommon->dilateY() + 1; - - int padNeededWidth = (output->width() - 1) * mCommon->strideX() + kernelWidthSize - input->width(); - int padNeededHeight = (output->height() - 1) * mCommon->strideY() + kernelHeightSize - input->height(); - mIm2ColParamter.padX = padNeededWidth / 2; - mIm2ColParamter.padY = padNeededHeight / 2; - } - - mIm2ColParamter.ih = input->height(); - mIm2ColParamter.iw = input->width(); - mIm2ColParamter.oh = output->height(); - mIm2ColParamter.ow = output->width(); - - mTileCount = UP_DIV(output->height() * output->width(), DST_XUNIT); - const int threads = std::max(1, static_cast(backend())->numberThread()); - mThreadNums = std::min(threads, mTileCount); - - mIm2ColBuffer.setType(DataType_DT_BFLOAT16); - mIm2ColBuffer.buffer().dimensions = 3; - mIm2ColBuffer.setLength(0, mThreadNums); - mIm2ColBuffer.setLength(1, DST_XUNIT); - mIm2ColBuffer.setLength(2, mWeightFp16->length(1) * ARMV82_CHANNEL_UNIT); - TensorUtils::setLinearLayout(&mIm2ColBuffer); - - mRemainBuffer.setType(DataType_DT_BFLOAT16); - mRemainBuffer.buffer().dimensions = 3; - mRemainBuffer.setLength(0, mThreadNums); - mRemainBuffer.setLength(1, DST_XUNIT); - mRemainBuffer.setLength(2, UP_DIV(output->channel(), ARMV82_CHANNEL_UNIT) * ARMV82_CHANNEL_UNIT); - TensorUtils::setLinearLayout(&mRemainBuffer); - bool success = backend()->onAcquireBuffer(&mIm2ColBuffer, Backend::DYNAMIC); - success = success && backend()->onAcquireBuffer(&mRemainBuffer, Backend::DYNAMIC); - if (!success) { - return OUT_OF_MEMORY; - } - - backend()->onReleaseBuffer(&mIm2ColBuffer, Backend::DYNAMIC); - backend()->onReleaseBuffer(&mRemainBuffer, Backend::DYNAMIC); - - return NO_ERROR; -} - -ErrorCode Arm82Convolution::onExecute(const std::vector &inputs, const std::vector &outputs) { - auto input = inputs[0]; - auto output = outputs[0]; - const int outputPlaneLen = output->height() * output->width(); - - const int dstZStep = outputPlaneLen * ARMV82_CHANNEL_UNIT; - const int batch = input->batch(); - const int ocDiv8 = UP_DIV(output->channel(), ARMV82_CHANNEL_UNIT); - const int kernelCountUnit = mIm2ColParamter.kernelCountUnit; - - const auto inputDataPtr = input->host(); - const auto 
weightDataPtr = mWeightFp16->host(); - const auto biasDataPtr = mBiasFp16->host(); - auto im2ColPtr = mIm2ColBuffer.host(); - auto outputDataPtr = output->host(); - auto remainDataPtr = mRemainBuffer.host(); - - auto im2ColProcess = Im2ColTransformer; - bool useFastIm2Col = mIm2ColParamter.kernelX == 1 && mIm2ColParamter.kernelY == 1 && mIm2ColParamter.strideX == 1 && - mIm2ColParamter.strideY == 1 && mIm2ColParamter.padX == 0 && mIm2ColParamter.padY == 0; - - if (useFastIm2Col) { - im2ColProcess = Im2ColTransformer1x1; - } - - const int inBatchStride = ROUND_UP(input->channel(), ARMV82_CHANNEL_UNIT) * input->height() * input->width(); - const int outBatchStride = ocDiv8 * dstZStep; - for (int bIndex = 0; bIndex < batch; ++bIndex) { - const auto srcBatchPtr = inputDataPtr + bIndex * inBatchStride; - auto dstBatchPtr = outputDataPtr + bIndex * outBatchStride; - - auto threadFunction = [&](int tId) { - auto im2ColCurPtr = im2ColPtr + tId * mIm2ColBuffer.stride(0); - auto gemmOutputPtr = remainDataPtr + tId * mRemainBuffer.stride(0); - - for (int tIndex = tId; tIndex < mTileCount; tIndex += mThreadNums) { - const int xIndexStart = tIndex * DST_XUNIT; - const int realDstCount = ALIMIN(outputPlaneLen - xIndexStart, DST_XUNIT); - - Im2ColTransformer(im2ColCurPtr, srcBatchPtr, &mIm2ColParamter, xIndexStart, realDstCount); - - auto outputCurTilePtr = dstBatchPtr + xIndexStart * ARMV82_CHANNEL_UNIT; - - if (realDstCount == DST_XUNIT) { - // compute one tile - MNNGemmFP16C8_UNIT(outputCurTilePtr, im2ColCurPtr, weightDataPtr, biasDataPtr, kernelCountUnit, - dstZStep * sizeof(FLOAT16), ocDiv8, mRelu, mRelu6, realDstCount); - } else { - // compute the remain - MNNGemmFP16C8_UNIT(gemmOutputPtr, im2ColCurPtr, weightDataPtr, biasDataPtr, kernelCountUnit, - ARMV82_CHANNEL_UNIT * DST_XUNIT * sizeof(FLOAT16), ocDiv8, mRelu, mRelu6, - realDstCount); - for (int z = 0; z < ocDiv8; ++z) { - auto outputz = outputCurTilePtr + z * dstZStep; - auto srcz = gemmOutputPtr + z * ARMV82_CHANNEL_UNIT * DST_XUNIT; - memcpy(outputz, srcz, realDstCount * ARMV82_CHANNEL_UNIT * sizeof(FLOAT16)); - } - } - } - }; - - MNN_CONCURRENCY_BEGIN(tId, mThreadNums) - threadFunction((int)tId); -#ifdef MNN_USE_THREAD_POOL - MNN_CONCURRENCY_END(); -#else - MNN_CONCURRENCY_END(); -#endif - } - - return NO_ERROR; -} - -class Arm82ConvolutionCreator : public Arm82Backend::Arm82Creator { - virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, - const MNN::Op *op, Backend *backend) const override { - auto convParam = op->main_as_Convolution2D(); - // avoid other quantize method entry this creator - if(convParam->quanParameter() && convParam->quanParameter()->type() != 3){ - return nullptr; - } - -#ifdef __aarch64__ - const auto param = convParam->common(); - if (param->kernelX() == 3 && param->kernelY() == 3 && param->strideX() == 1 && param->strideY() == 1 && - param->dilateX() == 1 && param->dilateY() == 1) { - return new Arm82Convolution3x3(convParam, backend); - } -#endif - return new Arm82Convolution(convParam, backend); - } -}; - -REGISTER_ARM82_OP_CREATOR(OpType_Convolution, Arm82ConvolutionCreator); - -} // namespace MNN - -#endif \ No newline at end of file diff --git a/source/backend/arm82/Arm82Convolution.hpp b/source/backend/arm82/Arm82Convolution.hpp deleted file mode 100644 index 742292d6..00000000 --- a/source/backend/arm82/Arm82Convolution.hpp +++ /dev/null @@ -1,40 +0,0 @@ -// -// Arm82Convolution.hpp -// MNN -// -// Created by MNN on 2020/01/07. 
-// Copyright © 2018, Alibaba Group Holding Limited -// -#ifdef __aarch64__ -#ifndef Arm82Convolution_hpp -#define Arm82Convolution_hpp - -#include "core/ConvolutionCommon.hpp" -#include "core/Execution.hpp" - -namespace MNN { -class Arm82Convolution : public Execution { -public: - Arm82Convolution(const MNN::Convolution2D *convParam, Backend *bn); - virtual ~Arm82Convolution(); - virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; - -private: - // plane tile number - int mTileCount; - int mThreadNums; - bool mRelu; - bool mRelu6; - ConvolutionCommon::Im2ColParameter mIm2ColParamter; - std::shared_ptr mWeightFp16; - std::shared_ptr mBiasFp16; - - Tensor mIm2ColBuffer; - Tensor mRemainBuffer; - const Convolution2DCommon *mCommon; -}; -} // namespace MNN - -#endif /* Arm82Convolution_hpp */ -#endif diff --git a/source/backend/arm82/Arm82Convolution3x3.cpp b/source/backend/arm82/Arm82Convolution3x3.cpp deleted file mode 100644 index c3f0b01f..00000000 --- a/source/backend/arm82/Arm82Convolution3x3.cpp +++ /dev/null @@ -1,537 +0,0 @@ -// -// Arm82Convolution3x3.cpp -// MNN -// -// Created by MNN on 2020/02/04. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __aarch64__ - -#include "backend/arm82/Arm82Convolution3x3.hpp" -#include "backend/arm82/Arm82OptFunc.hpp" -#include "core/Concurrency.h" -#include "core/Macro.h" -#include "core/TensorUtils.hpp" -#include "core/ConvolutionCommon.hpp" - -#ifdef MNN_USE_NEON -#include -#endif - -constexpr int CONV3X3_WINO_OUT = 4; -constexpr int CONV3X3_WINO_KER = 3; -constexpr int CONV3X3_WINO_IN = CONV3X3_WINO_OUT + CONV3X3_WINO_KER - 1; -constexpr int CONV3X3_WEIGHT_UNIT = CONV3X3_WINO_IN * CONV3X3_WINO_IN * ARMV82_CHANNEL_UNIT; - -constexpr int CONV3X3_WINO_TILE = 8; -constexpr int CONV3X3_WINO_SRC_NUM = CONV3X3_WINO_IN * CONV3X3_WINO_IN * ARMV82_CHANNEL_UNIT; - -namespace MNN { - -// winograd F(4,3) -#ifdef MNN_USE_NEON -static void kernelTransform_wino_4x4_3x3(const FLOAT16* src, FLOAT16* dst, int step) { - FLOAT16 midResult6X3[6][3]; - - for (int i = 0; i < CONV3X3_WINO_KER; ++i) { - FLOAT16 a0i = src[i]; - FLOAT16 a1i = src[1 * CONV3X3_WINO_KER + i]; - FLOAT16 a2i = src[2 * CONV3X3_WINO_KER + i]; - - midResult6X3[0][i] = 0.25f * a0i; - midResult6X3[1][i] = (a0i + a1i + a2i) * -0.1666666666666667f; - midResult6X3[2][i] = (a0i - a1i + a2i) * -0.1666666666666667f; - midResult6X3[3][i] = a0i * 0.04166667f + a1i * 0.08333333f + a2i * 0.1666666666666667f; - midResult6X3[4][i] = a0i * 0.04166667f - a1i * 0.08333333f + a2i * 0.1666666666666667f; - midResult6X3[5][i] = a2i; - } - - for (int i = 0; i < CONV3X3_WINO_IN; ++i) { - auto curRowDst = dst; - curRowDst[0 * step] = 0.25f * midResult6X3[i][0]; - curRowDst[1 * step] = (midResult6X3[i][0] + midResult6X3[i][1] + midResult6X3[i][2]) * -0.1666666666666667f; - curRowDst[2 * step] = (midResult6X3[i][0] - midResult6X3[i][1] + midResult6X3[i][2]) * -0.1666666666666667f; - curRowDst[3 * step] = midResult6X3[i][0] * 0.04166667f + midResult6X3[i][1] * 0.08333333f + - midResult6X3[i][2] * 0.1666666666666667f; - curRowDst[4 * step] = midResult6X3[i][0] * 0.04166667f - midResult6X3[i][1] * 0.08333333f + - midResult6X3[i][2] * 0.1666666666666667f; - curRowDst[5 * step] = midResult6X3[i][2]; - dst += CONV3X3_WINO_IN * step; - } -} - -static void sourceTransform_wino_4x4_3x3(const FLOAT16* src, FLOAT16* dst, int step) { - FLOAT16 midResult[6][6][ARMV82_CHANNEL_UNIT]; - - 
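    // Winograd F(4x4, 3x3) input transform for one 6x6 fp16 tile: the constants below
    // (4, -5, -4, 2) are the non-trivial entries of the standard F(4,3) B^T matrix.
    // The first pass transforms the tile columns into midResult, the second pass
    // transforms the rows (i.e. it computes B^T * d * B) and writes the 36 transformed
    // values out with the given step.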
float16x8_t value_4 = vmovq_n_f16(4); - float16x8_t value_neg_5 = vmovq_n_f16(-5); - float16x8_t value_neg_4 = vmovq_n_f16(-4); - float16x8_t value_2 = vmovq_n_f16(2); - - for (int i = 0; i < CONV3X3_WINO_IN; ++i) { - float16x8_t a0i = vld1q_f16(src + (0 * CONV3X3_WINO_IN + i) * ARMV82_CHANNEL_UNIT); - float16x8_t a1i = vld1q_f16(src + (1 * CONV3X3_WINO_IN + i) * ARMV82_CHANNEL_UNIT); - float16x8_t a2i = vld1q_f16(src + (2 * CONV3X3_WINO_IN + i) * ARMV82_CHANNEL_UNIT); - float16x8_t a3i = vld1q_f16(src + (3 * CONV3X3_WINO_IN + i) * ARMV82_CHANNEL_UNIT); - float16x8_t a4i = vld1q_f16(src + (4 * CONV3X3_WINO_IN + i) * ARMV82_CHANNEL_UNIT); - float16x8_t a5i = vld1q_f16(src + (5 * CONV3X3_WINO_IN + i) * ARMV82_CHANNEL_UNIT); - - float16x8_t b0 = vfmaq_f16(a4i, a2i, value_neg_4); - float16x8_t b1 = vfmaq_f16(a3i, a1i, value_neg_4); - float16x8_t b2 = vsubq_f16(a4i, a2i); - float16x8_t b3 = vmulq_f16(vsubq_f16(a3i, a1i), value_2); - float16x8_t b4 = vfmaq_f16(a4i, a0i, value_4); - float16x8_t b5 = vfmaq_f16(a5i, a1i, value_4); - - float16x8_t r0 = vfmaq_f16(b4, value_neg_5, a2i); - float16x8_t r1 = vaddq_f16(b0, b1); - float16x8_t r2 = vsubq_f16(b0, b1); - float16x8_t r3 = vaddq_f16(b2, b3); - float16x8_t r4 = vsubq_f16(b2, b3); - float16x8_t r5 = vfmaq_f16(b5, value_neg_5, a3i); - - vst1q_f16(midResult[0][i], r0); - vst1q_f16(midResult[1][i], r1); - vst1q_f16(midResult[2][i], r2); - vst1q_f16(midResult[3][i], r3); - vst1q_f16(midResult[4][i], r4); - vst1q_f16(midResult[5][i], r5); - } - - for (int i = 0; i < CONV3X3_WINO_IN; ++i) { - float16x8_t a0i = vld1q_f16(midResult[i][0]); - float16x8_t a1i = vld1q_f16(midResult[i][1]); - float16x8_t a2i = vld1q_f16(midResult[i][2]); - float16x8_t a3i = vld1q_f16(midResult[i][3]); - float16x8_t a4i = vld1q_f16(midResult[i][4]); - float16x8_t a5i = vld1q_f16(midResult[i][5]); - - float16x8_t b0 = vfmaq_f16(a4i, a2i, value_neg_4); - float16x8_t b1 = vfmaq_f16(a3i, a1i, value_neg_4); - float16x8_t b2 = vsubq_f16(a4i, a2i); - float16x8_t b3 = vmulq_f16(vsubq_f16(a3i, a1i), value_2); - float16x8_t b4 = vfmaq_f16(a4i, a0i, value_4); - float16x8_t b5 = vfmaq_f16(a5i, a1i, value_4); - - float16x8_t r0 = vfmaq_f16(b4, value_neg_5, a2i); - float16x8_t r1 = vaddq_f16(b0, b1); - float16x8_t r2 = vsubq_f16(b0, b1); - float16x8_t r3 = vaddq_f16(b2, b3); - float16x8_t r4 = vsubq_f16(b2, b3); - float16x8_t r5 = vfmaq_f16(b5, value_neg_5, a3i); - - vst1q_f16(dst + 0 * step, r0); - vst1q_f16(dst + 1 * step, r1); - vst1q_f16(dst + 2 * step, r2); - vst1q_f16(dst + 3 * step, r3); - vst1q_f16(dst + 4 * step, r4); - vst1q_f16(dst + 5 * step, r5); - dst += CONV3X3_WINO_IN * step; - } -} - -static void dstTransform_wino_4x4_3x3(const FLOAT16* src, const FLOAT16* bias, bool relu, bool relu6, FLOAT16* dst, - int step) { - FLOAT16 midResult[4][6][ARMV82_CHANNEL_UNIT]; - - float16x8_t value_0 = vmovq_n_f16(0); - float16x8_t value_6 = vmovq_n_f16(6); - float16x8_t value_2 = vmovq_n_f16(2); - float16x8_t value_4 = vmovq_n_f16(4); - float16x8_t value_8 = vmovq_n_f16(8); - - float16x8_t value_bias = vld1q_f16(bias); - - for (int i = 0; i < CONV3X3_WINO_IN; ++i) { - float16x8_t a0i = vld1q_f16(src + (CONV3X3_WINO_IN * 0 + i) * step); - float16x8_t a1i = vld1q_f16(src + (CONV3X3_WINO_IN * 1 + i) * step); - float16x8_t a2i = vld1q_f16(src + (CONV3X3_WINO_IN * 2 + i) * step); - float16x8_t a3i = vld1q_f16(src + (CONV3X3_WINO_IN * 3 + i) * step); - float16x8_t a4i = vld1q_f16(src + (CONV3X3_WINO_IN * 4 + i) * step); - float16x8_t a5i = vld1q_f16(src + (CONV3X3_WINO_IN * 5 + i) * step); - - 
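        // Winograd F(4x4, 3x3) output transform: the two passes below reduce each 6x6
        // accumulator tile to a 4x4 output block (the coefficients 2, 4 and 8 come from the
        // standard F(4,3) A^T matrix); the per-channel bias and the optional relu / relu6
        // clamping are fused into the second pass before the results are stored.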
float16x8_t b0 = vaddq_f16(a1i, a2i); - float16x8_t b1 = vaddq_f16(a3i, a4i); - float16x8_t b2 = vsubq_f16(a1i, a2i); - float16x8_t b3 = vsubq_f16(a3i, a4i); - - float16x8_t r0 = vaddq_f16(vaddq_f16(b0, b1), a0i); - float16x8_t r1 = vfmaq_f16(b2, b3, value_2); - float16x8_t r2 = vfmaq_f16(b0, b1, value_4); - float16x8_t r3 = vaddq_f16(a5i, vfmaq_f16(b2, b3, value_8)); - - vst1q_f16(midResult[0][i], r0); - vst1q_f16(midResult[1][i], r1); - vst1q_f16(midResult[2][i], r2); - vst1q_f16(midResult[3][i], r3); - } - - for (int i = 0; i < CONV3X3_WINO_OUT; ++i) { - float16x8_t a0i = vld1q_f16(midResult[i][0]); - float16x8_t a1i = vld1q_f16(midResult[i][1]); - float16x8_t a2i = vld1q_f16(midResult[i][2]); - float16x8_t a3i = vld1q_f16(midResult[i][3]); - float16x8_t a4i = vld1q_f16(midResult[i][4]); - float16x8_t a5i = vld1q_f16(midResult[i][5]); - - float16x8_t b0 = vaddq_f16(a1i, a2i); - float16x8_t b1 = vaddq_f16(a3i, a4i); - float16x8_t b2 = vsubq_f16(a1i, a2i); - float16x8_t b3 = vsubq_f16(a3i, a4i); - - float16x8_t r0 = vaddq_f16(vaddq_f16(b0, b1), a0i); - float16x8_t r1 = vfmaq_f16(b2, b3, value_2); - float16x8_t r2 = vfmaq_f16(b0, b1, value_4); - float16x8_t r3 = vaddq_f16(a5i, vfmaq_f16(b2, b3, value_8)); - - r0 = vaddq_f16(r0, value_bias); - r1 = vaddq_f16(r1, value_bias); - r2 = vaddq_f16(r2, value_bias); - r3 = vaddq_f16(r3, value_bias); - - if (relu) { - r0 = vmaxq_f16(r0, value_0); - r1 = vmaxq_f16(r1, value_0); - r2 = vmaxq_f16(r2, value_0); - r3 = vmaxq_f16(r3, value_0); - } - if (relu6) { - r0 = vmaxq_f16(r0, value_0); - r1 = vmaxq_f16(r1, value_0); - r2 = vmaxq_f16(r2, value_0); - r3 = vmaxq_f16(r3, value_0); - r0 = vminq_f16(r0, value_6); - r1 = vminq_f16(r1, value_6); - r2 = vminq_f16(r2, value_6); - r3 = vminq_f16(r3, value_6); - } - - vst1q_f16(dst + 0 * ARMV82_CHANNEL_UNIT, r0); - vst1q_f16(dst + 1 * ARMV82_CHANNEL_UNIT, r1); - vst1q_f16(dst + 2 * ARMV82_CHANNEL_UNIT, r2); - vst1q_f16(dst + 3 * ARMV82_CHANNEL_UNIT, r3); - dst += CONV3X3_WINO_OUT * ARMV82_CHANNEL_UNIT; - } -} - -#endif - -Arm82Convolution3x3::Arm82Convolution3x3(const MNN::Convolution2D* convParam, Backend* bn) : Execution(bn) { - const auto commonParam = convParam->common(); - mCommon = commonParam; - int inputChannel = commonParam->inputCount(); - const int outputChannel = commonParam->outputCount(); - - if (inputChannel == 0) { - if (convParam->quanParameter()) { - inputChannel = convParam->quanParameter()->buffer()->size() / (2 * 9 * outputChannel); - } else { - inputChannel = convParam->weight()->size() / (9 * outputChannel); - } - } - - const int icDiv8 = UP_DIV(inputChannel, ARMV82_CHANNEL_UNIT); - const int ocDiv8 = UP_DIV(outputChannel, ARMV82_CHANNEL_UNIT); - mRelu = mCommon->relu(); - mRelu6 = mCommon->relu6(); - // transform weight - { - mWeightFp16.reset( - Tensor::createDevice({icDiv8 * ocDiv8 * CONV3X3_WEIGHT_UNIT * ARMV82_CHANNEL_UNIT})); - mValid = bn->onAcquireBuffer(mWeightFp16.get(), Backend::STATIC); - if (!mValid) { - return; - } - - memset(mWeightFp16->host(), 0, mWeightFp16->size()); - - // Set source size align avoid of heap error - std::vector weightFp16(ocDiv8 * ARMV82_CHANNEL_UNIT * inputChannel * CONV3X3_WINO_KER * CONV3X3_WINO_KER, 0); - const FLOAT16* fp16WeightPtr = weightFp16.data(); - if (convParam->quanParameter()) { - MNN_ASSERT((convParam->quanParameter()->type() == 3) || (convParam->quanParameter()->type() == 4)); - if (convParam->quanParameter()->type() == 3) { - // the data type of weight is fp16 - ::memcpy(weightFp16.data(), convParam->quanParameter()->buffer()->data(), 
convParam->quanParameter()->buffer()->size()); - } - if (convParam->quanParameter()->type() == 4) { - std::shared_ptr quanCommon; - quanCommon = ConvolutionCommon::load(convParam->quanParameter(), true); - int weightCount = convParam->quanParameter()->buffer()->size(); - MNNQuantizeFP16(weightFp16.data(), quanCommon->weightFloat.get(), weightCount); - } - } else { - // the data type of weight is fp32, then quantize weight to be fp16 data type - int size = convParam->weight()->size(); - MNNQuantizeFP16(weightFp16.data(), convParam->weight()->data(), size); - } - - const auto srcWeightPtr = fp16WeightPtr; - auto dstWeightPtr = mWeightFp16->host(); - - auto transformWeight = [&](int ocUnit, int ocStart, int ocEnd, FLOAT16* weight) { - for (int oc = ocStart; oc < ocEnd; ++oc) { - const int oci = oc / ocUnit; - const int ocj = oc % ocUnit; - const auto srcWeightOcPtr = srcWeightPtr + oc * inputChannel * CONV3X3_WINO_KER * CONV3X3_WINO_KER; - auto dstWeightOcPtr = weight + oci * icDiv8 * ARMV82_CHANNEL_UNIT * ocUnit + ocj; - for (int ic = 0; ic < inputChannel; ++ic) { - const auto srcWeightIcPtr = srcWeightOcPtr + ic * CONV3X3_WINO_KER * CONV3X3_WINO_KER; - auto dstWeightIcPtr = dstWeightOcPtr + ic * ocUnit; - - kernelTransform_wino_4x4_3x3(srcWeightIcPtr, dstWeightIcPtr, - icDiv8 * ocDiv8 * ARMV82_CHANNEL_UNIT * ARMV82_CHANNEL_UNIT); - } - } - }; - - const int ocDivDoubleUnit = ocDiv8 / 2; - if (ocDivDoubleUnit > 0) { - transformWeight((ARMV82_CHANNEL_UNIT * 2), 0, ocDivDoubleUnit * (ARMV82_CHANNEL_UNIT * 2), dstWeightPtr); - } - if (ocDiv8 % 2 == 1) { - transformWeight(ARMV82_CHANNEL_UNIT, ocDivDoubleUnit * (ARMV82_CHANNEL_UNIT * 2), outputChannel, - dstWeightPtr); - } - } - - mBiasFp16.reset(Tensor::createDevice({ocDiv8 * ARMV82_CHANNEL_UNIT})); - mValid = bn->onAcquireBuffer(mBiasFp16.get(), Backend::STATIC); - if (!mValid) { - return; - } - - // TODO, bias is fp32, save bias also in fp16? 
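    // As in Arm82Convolution, the fp32 bias is re-quantized to fp16 below via
    // MNNQuantizeFP16. Note that onExecute passes a zero-filled mDummyBias to the per-tile
    // GEMM calls; the real bias held here (together with relu / relu6) is only applied
    // inside the Winograd output transform.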
- auto biasDstPtr = mBiasFp16->host(); - memset(biasDstPtr, 0, mBiasFp16->size()); - MNNQuantizeFP16(biasDstPtr, convParam->bias()->data(), outputChannel); -} - -Arm82Convolution3x3::~Arm82Convolution3x3() { - if (nullptr != mWeightFp16) { - backend()->onReleaseBuffer(mWeightFp16.get(), Backend::STATIC); - } - if (nullptr != mBiasFp16) { - backend()->onReleaseBuffer(mBiasFp16.get(), Backend::STATIC); - } -} - -ErrorCode Arm82Convolution3x3::onResize(const std::vector& inputs, const std::vector& outputs) { - auto input = inputs[0]; - auto output = outputs[0]; - - mPadX = mCommon->padX(); - mPadY = mCommon->padY(); - if (mCommon->padMode() == PadMode_SAME) { - int kernelWidthSize = (mCommon->kernelX() - 1) * mCommon->dilateX() + 1; - int kernelHeightSize = (mCommon->kernelY() - 1) * mCommon->dilateY() + 1; - - int padNeededWidth = (output->width() - 1) * mCommon->strideX() + kernelWidthSize - input->width(); - int padNeededHeight = (output->height() - 1) * mCommon->strideY() + kernelHeightSize - input->height(); - mPadX = padNeededWidth / 2; - mPadY = padNeededHeight / 2; - } - - mThreadNums = std::max(static_cast(backend())->numberThread(), 1); - mTransformBuffer.buffer().dimensions = 4; - mTransformBuffer.setType(DataType_DT_BFLOAT16); - mTransformBuffer.setLength(0, mThreadNums); - mTransformBuffer.setLength(1, CONV3X3_WINO_TILE); - mTransformBuffer.setLength( - 2, UP_DIV(input->channel(), ARMV82_CHANNEL_UNIT) + UP_DIV(output->channel(), ARMV82_CHANNEL_UNIT) + 1); - mTransformBuffer.setLength(3, CONV3X3_WINO_SRC_NUM); - TensorUtils::setLinearLayout(&mTransformBuffer); - - bool allocSuccess = backend()->onAcquireBuffer(&mTransformBuffer, Backend::DYNAMIC); - if (!allocSuccess) { - return OUT_OF_MEMORY; - } - - mDummyBias.buffer().dimensions = 1; - mDummyBias.setType(DataType_DT_BFLOAT16); - mDummyBias.setLength(0, UP_DIV(output->channel(), ARMV82_CHANNEL_UNIT) * ARMV82_CHANNEL_UNIT); - allocSuccess = backend()->onAcquireBuffer(&mDummyBias, Backend::DYNAMIC); - if (!allocSuccess) { - return OUT_OF_MEMORY; - } - - backend()->onReleaseBuffer(&mTransformBuffer, Backend::DYNAMIC); - backend()->onReleaseBuffer(&mDummyBias, Backend::DYNAMIC); - return NO_ERROR; -} - -ErrorCode Arm82Convolution3x3::onExecute(const std::vector& inputs, const std::vector& outputs) { - auto input = inputs[0]; - auto output = outputs[0]; - const int batch = input->batch(); - const int ih = input->height(); - const int iw = input->width(); - const int ihw = ih * iw; - const int icDiv8 = UP_DIV(input->channel(), ARMV82_CHANNEL_UNIT); - const int oh = output->height(); - const int ow = output->width(); - const int ohw = oh * ow; - const int ocDiv8 = UP_DIV(output->channel(), ARMV82_CHANNEL_UNIT); - - const int hUnit = UP_DIV(oh, CONV3X3_WINO_OUT); - const int wUnit = UP_DIV(ow, CONV3X3_WINO_OUT); - - const int hPadded = hUnit * CONV3X3_WINO_OUT - oh; - const int wPadded = wUnit * CONV3X3_WINO_OUT - ow; - - const int outUnitCount = hUnit * wUnit; - const int tileCount = UP_DIV(outUnitCount, CONV3X3_WINO_TILE); - - const auto weightPtr = mWeightFp16->host(); - const auto biasDummyPtr = mDummyBias.host(); - const auto biasPtr = mBiasFp16->host(); - - memset(mDummyBias.host(), 0, mDummyBias.size()); - - auto srcGetAndTransformFunc = [=](int xIndex, int realTile, const FLOAT16* srcOrigin, FLOAT16* transformedBuffer, - FLOAT16* tempBuffer) { - memset(tempBuffer, 0, CONV3X3_WINO_TILE * CONV3X3_WINO_SRC_NUM * sizeof(FLOAT16)); - for (int tindex = 0; tindex < realTile; ++tindex) { - int index = xIndex + tindex; - int hindex = 
index / wUnit; - int windex = index % wUnit; - - int srcX = windex * CONV3X3_WINO_OUT - mPadX; - int srcY = hindex * CONV3X3_WINO_OUT - mPadY; - int sy = ALIMAX(0, srcY) - srcY; - int ey = ALIMIN(srcY + CONV3X3_WINO_IN, ih) - srcY; - int sx = ALIMAX(0, srcX) - srcX; - int ex = ALIMIN(srcX + CONV3X3_WINO_IN, iw) - srcX; - - const auto srcStart = srcOrigin + (srcX + srcY * iw) * ARMV82_CHANNEL_UNIT; - auto curTransPtr = transformedBuffer + tindex * ARMV82_CHANNEL_UNIT; - auto curTempBuffer = tempBuffer + tindex * CONV3X3_WINO_SRC_NUM; - - for (int c = 0; c < icDiv8; ++c) { - const auto curChannelSrcPtr = srcStart + c * ihw * ARMV82_CHANNEL_UNIT; - auto curChannelTransPtr = curTransPtr + c * CONV3X3_WINO_TILE * ARMV82_CHANNEL_UNIT; - if (ex > sx) { - for (int yy = sy; yy < ey; ++yy) { - const auto srcPtr = curChannelSrcPtr + yy * iw * ARMV82_CHANNEL_UNIT; - auto dstPtr = curTempBuffer + yy * CONV3X3_WINO_IN * ARMV82_CHANNEL_UNIT; - - memcpy(dstPtr + ARMV82_CHANNEL_UNIT * sx, srcPtr + ARMV82_CHANNEL_UNIT * sx, - (ex - sx) * sizeof(FLOAT16) * ARMV82_CHANNEL_UNIT); - } - } - - sourceTransform_wino_4x4_3x3(curTempBuffer, curChannelTransPtr, - ARMV82_CHANNEL_UNIT * CONV3X3_WINO_TILE * icDiv8); - } - } - - // shuffel channel - if (realTile > (CONV3X3_WINO_TILE / 2)) { - MNNShuffleChannelC8(transformedBuffer, transformedBuffer, - (size_t)(icDiv8 * CONV3X3_WINO_IN * CONV3X3_WINO_IN), 0); - } else { - for (int i = 0; i < CONV3X3_WINO_IN * CONV3X3_WINO_IN; ++i) { - auto dst = transformedBuffer + i * ARMV82_CHANNEL_UNIT * CONV3X3_WINO_TILE * icDiv8; - MNNShuffleChannelC8(dst, dst, (size_t)(icDiv8), 1); - } - } - }; - - auto dstTransformAndSave = [=](int xIndex, int realTile, const FLOAT16* transformedBuffer, const FLOAT16* bias, - bool relu, bool relu6, FLOAT16* dstOrigin, FLOAT16* tempBuffer) { - for (int tindex = 0; tindex < realTile; ++tindex) { - int index = xIndex + tindex; - int hindex = index / wUnit; - int windex = index % wUnit; - int dstX = windex * CONV3X3_WINO_OUT; - int dstY = hindex * CONV3X3_WINO_OUT; - - const auto curTransPtr = transformedBuffer + tindex * ARMV82_CHANNEL_UNIT; - auto dstStartPtr = dstOrigin + (dstX + dstY * ow) * ARMV82_CHANNEL_UNIT; - auto curTempBuffer = tempBuffer + tindex * CONV3X3_WINO_SRC_NUM; - - int hReamin = CONV3X3_WINO_OUT; - int wReamin = CONV3X3_WINO_OUT; - - if (hindex == (hUnit - 1)) { - hReamin = CONV3X3_WINO_OUT - hPadded; - } - if (windex == (wUnit - 1)) { - wReamin = CONV3X3_WINO_OUT - wPadded; - } - - for (int z = 0; z < ocDiv8; ++z) { - const auto curChannelTransPtr = curTransPtr + z * CONV3X3_WINO_TILE * ARMV82_CHANNEL_UNIT; - auto dstZ = dstStartPtr + z * ohw * ARMV82_CHANNEL_UNIT; - - dstTransform_wino_4x4_3x3(curChannelTransPtr, bias + z * ARMV82_CHANNEL_UNIT, relu, relu6, - curTempBuffer, ocDiv8 * CONV3X3_WINO_TILE * ARMV82_CHANNEL_UNIT); - - // save 4x4 outputs from tempBuffer - for (int i = 0; i < hReamin; ++i) { - memcpy(dstZ + i * ow * ARMV82_CHANNEL_UNIT, - curTempBuffer + i * CONV3X3_WINO_OUT * ARMV82_CHANNEL_UNIT, - sizeof(FLOAT16) * ARMV82_CHANNEL_UNIT * wReamin); - } - } - } - }; - - auto threadFunction = [&](size_t tId, size_t tileStart, int tileStep, int tileEnd, const FLOAT16* srcOrigin, - FLOAT16* dstOrigin) { - auto curThreadTransformPtr = mTransformBuffer.host() + tId * mTransformBuffer.stride(0); - auto srcTransformedPtr = curThreadTransformPtr; - auto dstTransformedPtr = curThreadTransformPtr + CONV3X3_WINO_TILE * CONV3X3_WINO_SRC_NUM * icDiv8; - auto tempBufferPtr = curThreadTransformPtr + CONV3X3_WINO_TILE * 
CONV3X3_WINO_SRC_NUM * (icDiv8 + ocDiv8); - - for (size_t tindex = tileStart; tindex < tileEnd; tindex += tileStep) { - int xIndex = (int)tindex * CONV3X3_WINO_TILE; - int xRemain = outUnitCount - xIndex; - int realTileNum = xRemain > CONV3X3_WINO_TILE ? CONV3X3_WINO_TILE : xRemain; - - srcGetAndTransformFunc(xIndex, realTileNum, srcOrigin, srcTransformedPtr, tempBufferPtr); - - // matmul - for (int i = 0; i < CONV3X3_WINO_IN * CONV3X3_WINO_IN; ++i) { - MNNGemmFP16C8_UNIT(dstTransformedPtr + i * ocDiv8 * CONV3X3_WINO_TILE * ARMV82_CHANNEL_UNIT, - srcTransformedPtr + i * ARMV82_CHANNEL_UNIT * CONV3X3_WINO_TILE * icDiv8, - weightPtr + i * icDiv8 * ocDiv8 * ARMV82_CHANNEL_UNIT * ARMV82_CHANNEL_UNIT, - biasDummyPtr, icDiv8, ARMV82_CHANNEL_UNIT * CONV3X3_WINO_TILE * sizeof(FLOAT16), - ocDiv8, 0, 0, realTileNum); - } - - dstTransformAndSave(xIndex, realTileNum, dstTransformedPtr, biasPtr, mRelu, mRelu6, dstOrigin, - tempBufferPtr); - } - }; - - const auto srcOriginPtr = input->host(); - auto dstOriginPtr = output->host(); - const int inBatchStride = icDiv8 * ihw * ARMV82_CHANNEL_UNIT; - const int outBatchStride = ocDiv8 * ohw * ARMV82_CHANNEL_UNIT; - for (int bIndex = 0; bIndex < batch; ++bIndex) { - const auto curSrcBatchPtr = srcOriginPtr + bIndex * inBatchStride; - auto curDstBatchPtr = dstOriginPtr + bIndex * outBatchStride; - - if (tileCount >= mThreadNums) { - MNN_CONCURRENCY_BEGIN(tId, mThreadNums) - threadFunction((int)tId, (int)tId, mThreadNums, (tileCount / mThreadNums) * mThreadNums, curSrcBatchPtr, - curDstBatchPtr); -#ifdef MNN_USE_THREAD_POOL - MNN_CONCURRENCY_END(); -#else - MNN_CONCURRENCY_END(); -#endif - } - if (tileCount % mThreadNums != 0) { - threadFunction(0, (tileCount / mThreadNums) * mThreadNums, 1, tileCount, curSrcBatchPtr, curDstBatchPtr); - } - } - - return NO_ERROR; -} - -} // namespace MNN - -#endif diff --git a/source/backend/arm82/Arm82Convolution3x3.hpp b/source/backend/arm82/Arm82Convolution3x3.hpp deleted file mode 100644 index 41a787c0..00000000 --- a/source/backend/arm82/Arm82Convolution3x3.hpp +++ /dev/null @@ -1,43 +0,0 @@ -// -// Arm82Convolution3x3.hpp -// MNN -// -// Created by MNN on 2020/02/04. -// Copyright © 2018, Alibaba Group Holding Limited -// -#ifdef __aarch64__ - -#ifndef Arm82Convolution3x3_hpp -#define Arm82Convolution3x3_hpp - -#include "backend/arm82/Arm82Backend.hpp" -#include "core/Execution.hpp" - -namespace MNN { -class Arm82Convolution3x3 : public Execution { -public: - Arm82Convolution3x3(const MNN::Convolution2D *convParam, Backend *bn); - virtual ~Arm82Convolution3x3(); - virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; - -private: - int mTileCount; - int mThreadNums; - int mPadX; - int mPadY; - bool mRelu; - bool mRelu6; - std::shared_ptr mWeightFp16; - std::shared_ptr mBiasFp16; - - Tensor mTransformBuffer; - Tensor mDummyBias; - const Convolution2DCommon *mCommon; -}; - -} // namespace MNN - -#endif - -#endif diff --git a/source/backend/arm82/Arm82ConvolutionDepthwise.cpp b/source/backend/arm82/Arm82ConvolutionDepthwise.cpp deleted file mode 100644 index aa70afd6..00000000 --- a/source/backend/arm82/Arm82ConvolutionDepthwise.cpp +++ /dev/null @@ -1,362 +0,0 @@ -// -// Arm82ConvolutionDepthwise.cpp -// MNN -// -// Created by MNN on 2020/01/07. 
-// Copyright © 2018, Alibaba Group Holding Limited -// -#ifdef __aarch64__ -#include "backend/arm82/Arm82ConvolutionDepthwise.hpp" -#include "core/Concurrency.h" -#include "core/Macro.h" -#include "backend/arm82/Arm82OptFunc.hpp" -#include "core/ConvolutionCommon.hpp" - -#ifdef MNN_USE_NEON -#include -#endif - -extern "C" { -void MNNLineDepthWiseFp16C8Unit(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight, const FLOAT16* bias_z, - size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, - size_t dilateY_step, size_t relu, size_t relu6); -} - -namespace MNN { - -static void MNNDepthWiseFp16C8Unit(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight, const FLOAT16* bias, - size_t fw, size_t fh, size_t weight_y_step, size_t dilateX_step, size_t dilateY_step, - size_t relu, size_t relu6) { - int fx, fy; - -#ifdef MNN_USE_NEON - float16x8_t acc_value = vld1q_f16(bias); -#else - FLOAT16 acc_value[ARMV82_CHANNEL_UNIT]; - memcpy(acc_value, bias, sizeof(FLOAT16) * ARMV82_CHANNEL_UNIT); -#endif - - for (fy = 0; fy < fh; ++fy) { - const auto src_y = src + fy * dilateY_step; - const auto weight_y = weight + fy * weight_y_step; - for (fx = 0; fx < fw; ++fx) { - const auto weight_x = weight_y + fx * ARMV82_CHANNEL_UNIT; - const auto src_x = src_y + fx * dilateX_step; - -#ifdef MNN_USE_NEON - float16x8_t src_x_value = vld1q_f16(src_x); - float16x8_t weight_x_value = vld1q_f16(weight_x); - acc_value = vfmaq_f16(acc_value, src_x_value, weight_x_value); -#else - for (int j = 0; j < ARMV82_CHANNEL_UNIT; ++j) { - acc_value[j] += src_x[j] * weight_x[j]; - } -#endif - } - } - -#ifdef MNN_USE_NEON - if (relu) { - float16x8_t zero_value = vdupq_n_f16(float16_t(0.0)); - acc_value = vmaxq_f16(acc_value, zero_value); - } - if (relu6) { - float16x8_t zero_value = vdupq_n_f16(float16_t(0.0)); - float16x8_t six_value = vdupq_n_f16(float16_t(6.0)); - acc_value = vmaxq_f16(acc_value, zero_value); - acc_value = vminq_f16(acc_value, six_value); - } - vst1q_f16(dst, acc_value); -#else - if (relu) { - for (int j = 0; j < ARMV82_CHANNEL_UNIT; ++j) { - if (acc_value[j] < 0) { - acc_value[j] = 0; - } - } - } - if (relu6) { - for (int j = 0; j < ARMV82_CHANNEL_UNIT; ++j) { - if (acc_value[j] < 0) { - acc_value[j] = 0; - } - if (acc_value[j] > 6) { - acc_value[j] = 6.0; - } - } - } - memcpy(dst, acc_value, sizeof(FLOAT16) * ARMV82_CHANNEL_UNIT); -#endif -} - -#ifndef MNN_USE_NEON -static void MNNLineDepthWiseFp16C8Unit(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight, const FLOAT16* bias_z, - size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, - size_t dilateY_step, size_t relu, size_t relu6) { - int dx, fx, fy; - for (dx = 0; dx < width; ++dx) { - auto dst_x = dst + dx * ARMV82_CHANNEL_UNIT; - FLOAT16 dst_temp[ARMV82_CHANNEL_UNIT]; - memcpy(dst_temp, bias_z, sizeof(FLOAT16) * ARMV82_CHANNEL_UNIT); - - const auto src_z = src + src_w_step * dx; - - for (fy = 0; fy < fh; ++fy) { - const auto src_y = src_z + fy * dilateY_step; - const auto weight_y = weight + fy * fw * ARMV82_CHANNEL_UNIT; - for (fx = 0; fx < fw; ++fx) { - const auto src_x = src_y + fx * dilateX_step; - const auto weight_x = weight_y + fx * ARMV82_CHANNEL_UNIT; - - for (int j = 0; j < ARMV82_CHANNEL_UNIT; ++j) { - dst_temp[j] += src_x[j] * weight_x[j]; - } - } - } - - if (relu) { - for (int j = 0; j < ARMV82_CHANNEL_UNIT; ++j) { - if (dst_temp[j] < 0) { - dst_temp[j] = 0; - } - } - } - if (relu6) { - for (int j = 0; j < ARMV82_CHANNEL_UNIT; ++j) { - if (dst_temp[j] < 0) { - dst_temp[j] = 0; - } - if 
(dst_temp[j] > 6) { - dst_temp[j] = 6.0; - } - } - } - - memcpy(dst_x, dst_temp, sizeof(FLOAT16) * ARMV82_CHANNEL_UNIT); - } -} -#endif - -Arm82ConvolutionDepthwise::Arm82ConvolutionDepthwise(const MNN::Convolution2D* convParam, Backend* bn) : Execution(bn) { - const auto commonParam = convParam->common(); - mCommon = commonParam; - mRelu = commonParam->relu(); - mRelu6 = commonParam->relu6(); - const int kx = commonParam->kernelX(); - const int ky = commonParam->kernelY(); - const int kernelSize = kx * ky; - - const int outputChannel = commonParam->outputCount(); - const int ocDivUnit = UP_DIV(outputChannel, ARMV82_CHANNEL_UNIT); - const int weightSizeAlignLen = ocDivUnit * ARMV82_CHANNEL_UNIT * kernelSize; - mWeightFp16.reset(Tensor::createDevice({weightSizeAlignLen})); - auto success = bn->onAcquireBuffer(mWeightFp16.get(), Backend::STATIC); - if (!success) { - mValid = false; - return; - } - auto weightDstPtr = mWeightFp16->host(); - memset(weightDstPtr, 0, weightSizeAlignLen * sizeof(FLOAT16)); - - const FLOAT16* fp16WeightPtr = nullptr; - std::vector weightFp16; - if(convParam->quanParameter()){ - MNN_ASSERT((convParam->quanParameter()->type() == 3) || (convParam->quanParameter()->type() == 4)); - if (convParam->quanParameter()->type() == 3) { - // the data type of weight is fp16 - fp16WeightPtr = reinterpret_cast(convParam->quanParameter()->buffer()->data()); - } - if (convParam->quanParameter()->type() == 4) { - std::shared_ptr quanCommon; - quanCommon = ConvolutionCommon::load(convParam->quanParameter(), true); - int weightCount = convParam->quanParameter()->buffer()->size(); - weightFp16.resize(weightCount); - MNNQuantizeFP16(weightFp16.data(), quanCommon->weightFloat.get(), weightCount); - fp16WeightPtr = weightFp16.data(); - } - } else { - // the data type of weight is fp32, then quantize weight to be fp16 data type - int size = convParam->weight()->size(); - weightFp16.resize(size); - MNNQuantizeFP16(weightFp16.data(), convParam->weight()->data(), size); - fp16WeightPtr = weightFp16.data(); - } - - const auto weightSrcPtr = fp16WeightPtr; - int cur = 0; - for (int dz = 0; dz < outputChannel; ++dz) { - const int dzi = dz / ARMV82_CHANNEL_UNIT; - const int dzj = dz % ARMV82_CHANNEL_UNIT; - - auto dstDz = weightDstPtr + dzi * kernelSize * ARMV82_CHANNEL_UNIT + dzj; - for (int k = 0; k < kernelSize; ++k) { - dstDz[k * ARMV82_CHANNEL_UNIT] = weightSrcPtr[cur++]; - } - } - mBiasFp16.reset(Tensor::createDevice({ocDivUnit * ARMV82_CHANNEL_UNIT})); - success = bn->onAcquireBuffer(mBiasFp16.get(), Backend::STATIC); - if (!success) { - mValid = false; - return; - } - - // TODO, bias is fp32, save bias also in fp16? 
- auto biasDstPtr = mBiasFp16->host(); - memset(biasDstPtr, 0, mBiasFp16->size()); - - MNNQuantizeFP16(biasDstPtr, convParam->bias()->data(), outputChannel); -} - -Arm82ConvolutionDepthwise::~Arm82ConvolutionDepthwise() { - if (mWeightFp16 != nullptr) { - backend()->onReleaseBuffer(mWeightFp16.get(), Backend::STATIC); - } - if (mBiasFp16 != nullptr) { - backend()->onReleaseBuffer(mBiasFp16.get(), Backend::STATIC); - } -} - -ErrorCode Arm82ConvolutionDepthwise::onResize(const std::vector& inputs, const std::vector& outputs) { - auto input = inputs[0]; - auto output = outputs[0]; - - int padX = mCommon->padX(); - int padY = mCommon->padY(); - - if (mCommon->padMode() == PadMode_SAME) { - int kernelWidthSize = (mCommon->kernelX() - 1) * mCommon->dilateX() + 1; - int kernelHeightSize = (mCommon->kernelY() - 1) * mCommon->dilateY() + 1; - - int padNeededWidth = (output->width() - 1) * mCommon->strideX() + kernelWidthSize - input->width(); - int padNeededHeight = (output->height() - 1) * mCommon->strideY() + kernelHeightSize - input->height(); - padX = padNeededWidth / 2; - padY = padNeededHeight / 2; - } - - const int src_width = input->width(); - const int src_height = input->height(); - const int dst_width = output->width(); - const int dst_height = output->height(); - const int dst_depth_quad = UP_DIV(output->channel(), ARMV82_CHANNEL_UNIT); - const int dst_z_step = dst_width * dst_height * ARMV82_CHANNEL_UNIT; - const int src_z_step = src_width * src_height * ARMV82_CHANNEL_UNIT; - const int dst_y_step = dst_width * ARMV82_CHANNEL_UNIT; - const int src_y_step = src_width * ARMV82_CHANNEL_UNIT; - const int strideY = mCommon->strideY(); - const int strideX = mCommon->strideX(); - const int dilateY = mCommon->dilateY(); - const int dilateX = mCommon->dilateX(); - const int dilateY_step = dilateY * src_width * ARMV82_CHANNEL_UNIT; - const int dilateX_step = dilateX * ARMV82_CHANNEL_UNIT; - const int kernel_height = mCommon->kernelY(); - const int kernel_width = mCommon->kernelX(); - const int weight_z_step = kernel_width * kernel_height * ARMV82_CHANNEL_UNIT; - int l = 0, t = 0, r = dst_width, b = dst_height; - for (; l * strideX - padX < 0; l++) { - // do nothing - } - for (; t * strideY - padY < 0; t++) { - // do nothing - } - for (; (r - 1) * strideX - padX + kernel_width * dilateX > src_width && r > l; r--) { - // do nothing - } - for (; (b - 1) * strideY - padY + kernel_height * dilateY > src_height && b > t; b--) { - // do nothing - } - - const auto weightPtr = mWeightFp16->host(); - const auto biasPtr = mBiasFp16->host(); - const int threadNumber = static_cast(backend())->numberThread(); - mThreadNumber = std::min(threadNumber, dst_depth_quad); - auto runBasic = [=](FLOAT16* dst_z, const FLOAT16* src_z, const FLOAT16* weight_dz, const FLOAT16* bias_z, int L, - int T, int R, int B) { - for (int dy = T; dy < B; ++dy) { - auto dst_y = dst_z + dy * dst_y_step; - const int srcStartY = dy * strideY - padY; - const auto src_y = src_z + srcStartY * src_y_step; - const int sfy = ALIMAX(0, (UP_DIV(-srcStartY, dilateY))); - const int efy = ALIMIN(kernel_height, (UP_DIV(src_height - srcStartY, dilateY))); - for (int dx = L; dx < R; ++dx) { - auto dst_x = dst_y + ARMV82_CHANNEL_UNIT * dx; - const int srcStartX = dx * strideX - padX; - const auto src_x = src_y + srcStartX * ARMV82_CHANNEL_UNIT; - const int sfx = ALIMAX(0, (UP_DIV(-srcStartX, dilateX))); - const int efx = ALIMIN(kernel_width, (UP_DIV(src_width - srcStartX, dilateX))); - const int srcIndex = (sfx * dilateX + sfy * dilateY * src_width) * 
ARMV82_CHANNEL_UNIT; - const int weightIndex = (kernel_width * sfy + sfx) * ARMV82_CHANNEL_UNIT; - - MNNDepthWiseFp16C8Unit(dst_x, src_x + srcIndex, weight_dz + weightIndex, bias_z, efx - sfx, efy - sfy, - ARMV82_CHANNEL_UNIT * kernel_width, dilateX_step, dilateY_step, - (size_t)mRelu, (size_t)mRelu6); - } - } - }; - - mThreadFunction = [=](int tId, const FLOAT16* src, FLOAT16* dst) { - for (int dz = tId; dz < dst_depth_quad; dz += mThreadNumber) { - const auto src_z = src + dz * src_z_step; - const auto weight_dz = weightPtr + dz * weight_z_step; - const auto bias_dz = biasPtr + dz * ARMV82_CHANNEL_UNIT; - auto dst_z = dst + dz * dst_z_step; - runBasic(dst_z, src_z, weight_dz, bias_dz, 0, 0, dst_width, t); - runBasic(dst_z, src_z, weight_dz, bias_dz, 0, b, dst_width, dst_height); - runBasic(dst_z, src_z, weight_dz, bias_dz, 0, t, l, b); - runBasic(dst_z, src_z, weight_dz, bias_dz, r, t, dst_width, b); - if (r > l) { - for (int dy = t; dy < b; ++dy) { - const int srcStartY = dy * strideY - padY; - const auto src_dy = src_z + srcStartY * src_y_step; - auto dst_y = dst_z + dy * dst_y_step; - MNNLineDepthWiseFp16C8Unit( - dst_y + l * ARMV82_CHANNEL_UNIT, src_dy + (l * strideX - padX) * ARMV82_CHANNEL_UNIT, weight_dz, - bias_dz, r - l, strideX * ARMV82_CHANNEL_UNIT, kernel_width, kernel_height, dilateX_step, - dilateY_step, (size_t)mRelu, (size_t)mRelu6); - } - } - } - }; - - return NO_ERROR; -} - -ErrorCode Arm82ConvolutionDepthwise::onExecute(const std::vector& inputs, - const std::vector& outputs) { - - auto input = inputs[0]; - auto output = outputs[0]; - const int batch = input->batch(); - - const int inBatchStride = ROUND_UP(input->channel(), ARMV82_CHANNEL_UNIT) * input->height() * input->width(); - const int outBatchStride = ROUND_UP(output->channel(), ARMV82_CHANNEL_UNIT) * output->height() * output->width(); - - const auto inputPtr = input->host(); - auto outputPtr = output->host(); - - for (int bIndex = 0; bIndex < batch; ++bIndex) { - const auto srcOrigin = inputPtr + bIndex * inBatchStride; - auto dstOrigin = outputPtr + bIndex * outBatchStride; - - MNN_CONCURRENCY_BEGIN(tId, mThreadNumber) - mThreadFunction((int)tId, srcOrigin, dstOrigin); -#ifdef MNN_USE_THREAD_POOL - MNN_CONCURRENCY_END(); -#else - MNN_CONCURRENCY_END(); -#endif - } - return NO_ERROR; -} - -class Arm82ConvolutionDepthwiseCreator : public Arm82Backend::Arm82Creator { - virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, - const MNN::Op* op, Backend* backend) const override { - return new Arm82ConvolutionDepthwise(op->main_as_Convolution2D(), backend); - } -}; - -REGISTER_ARM82_OP_CREATOR(OpType_ConvolutionDepthwise, Arm82ConvolutionDepthwiseCreator); - -} // namespace MNN - -#endif \ No newline at end of file diff --git a/source/backend/arm82/Arm82ConvolutionDepthwise.hpp b/source/backend/arm82/Arm82ConvolutionDepthwise.hpp deleted file mode 100644 index 8dfca235..00000000 --- a/source/backend/arm82/Arm82ConvolutionDepthwise.hpp +++ /dev/null @@ -1,38 +0,0 @@ -// -// Arm82ConvolutionDepthwise.hpp -// MNN -// -// Created by MNN on 2020/01/07. 
-// Copyright © 2018, Alibaba Group Holding Limited -// -#ifdef __aarch64__ -#ifndef Arm82ConvolutionDepthwise_hpp -#define Arm82ConvolutionDepthwise_hpp - -#include "MNN_generated.h" -#include "backend/arm82/Arm82Backend.hpp" -#include "core/Execution.hpp" - -namespace MNN { -class Arm82ConvolutionDepthwise : public Execution { -public: - Arm82ConvolutionDepthwise(const MNN::Convolution2D *convParam, Backend *bn); - virtual ~Arm82ConvolutionDepthwise(); - virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; - -private: - std::shared_ptr mWeightFp16; - std::shared_ptr mBiasFp16; - const Convolution2DCommon *mCommon; - int mThreadNumber; - bool mRelu; - bool mRelu6; - std::function mThreadFunction; -}; - -} // namespace MNN - -#endif /* Arm82ConvolutionDepthwise_hpp */ - -#endif \ No newline at end of file diff --git a/source/backend/arm82/Arm82Eltwise.cpp b/source/backend/arm82/Arm82Eltwise.cpp index 667eef87..0a057efc 100644 --- a/source/backend/arm82/Arm82Eltwise.cpp +++ b/source/backend/arm82/Arm82Eltwise.cpp @@ -5,17 +5,13 @@ // Created by MNN on 2020/2/13. // Copyright © 2018, Alibaba Group Holding Limited // +#if defined(__ANDROID__) || defined(__aarch64__) -#ifdef __aarch64__ -#include "backend/arm82/Arm82Eltwise.hpp" -#include "backend/arm82/Arm82Backend.hpp" +#include "Arm82Eltwise.hpp" +#include "Arm82Backend.hpp" #include "core/Macro.h" #include "MNN_generated.h" - - -#ifdef MNN_USE_NEON #include -#endif namespace MNN { diff --git a/source/backend/arm82/Arm82Eltwise.hpp b/source/backend/arm82/Arm82Eltwise.hpp index 2c7c810e..8820510e 100644 --- a/source/backend/arm82/Arm82Eltwise.hpp +++ b/source/backend/arm82/Arm82Eltwise.hpp @@ -5,7 +5,8 @@ // Created by MNN on 2020/2/13. 
// Copyright © 2018, Alibaba Group Holding Limited // -#ifdef __aarch64__ +#if defined(__ANDROID__) || defined(__aarch64__) + #ifndef Arm82Eltwise_hpp #define Arm82Eltwise_hpp @@ -27,4 +28,4 @@ private: } // namespace MNN #endif /* Arm82Eltwise_hpp */ -#endif \ No newline at end of file +#endif diff --git a/source/backend/arm82/Arm82Functions.cpp b/source/backend/arm82/Arm82Functions.cpp new file mode 100644 index 00000000..fb69b305 --- /dev/null +++ b/source/backend/arm82/Arm82Functions.cpp @@ -0,0 +1,479 @@ +#if defined(__ANDROID__) || defined(__aarch64__) + +#include "Arm82Functions.hpp" +#include "Arm82OptFunc.hpp" +#include "Arm82WinogradOptFunc.hpp" +#include "Arm82Vec.hpp" +#include "backend/cpu/compute/CommonOptFunction.h" + +#if defined(MNN_USE_NEON) +#include +#endif + +extern "C" { +// (UP_DIV(l,8), e, 8) -> (UP_DIV(e,eP), l, eP) +void Arm82MNNPackForMatMul_A(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el); + +// C(UP_DIV(h,8), e, h8) = B(UP_DIV(h,hP), l, hP) * A(l, eP), hP = 24 +// parameter: [aStride, l, h, cStride, bExtraStride] +// aStride in parameter is deprecated (useless), but for code clean, just retain it +void MNNPackedMatMulFP16(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias); + +// C(UP_DIV(h,8), e, h8) = B(UP_DIV(h,hP), l, hP) * A(l, e), hP = 24, e >= 1 +// parameter: [aStride, l, h, cStride, bExtraStride] +void MNNPackedMatMulRemainFP16(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias); + +void MNNConvDwF23MulTransUnitFP16(FLOAT16 **cacheLine, const FLOAT16 *weight, FLOAT16 *dest, size_t ow); + +void MNNConvDwF23SourceTransUnitFP16(const FLOAT16 *source, FLOAT16 *dest, size_t unit); + +void MNNConvRunForLineDepthwiseFP16(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, + size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep); +} + +using Vec = MNN::Math::Vec; + +namespace MNN { + +static void MNNMatrixAddFP16(FLOAT16* C, const FLOAT16* A, const FLOAT16* B, size_t widthC8, size_t cStride, size_t aStride, size_t bStride, size_t height) { + for (int y = 0; y < height; ++y) { + auto a = A + aStride * y, b = B + bStride * y; + auto c = C + cStride * y; + for (int x = 0; x < widthC8; ++x) { + vst1q_f16(c + x * 8, vaddq_f16(vld1q_f16(a + x * 8), vld1q_f16(b + x * 8))); + } + } +} +static void MNNMatrixSubFP16(FLOAT16* C, const FLOAT16* A, const FLOAT16* B, size_t widthC8, size_t cStride, size_t aStride, size_t bStride, size_t height) { + for (int y = 0; y < height; ++y) { + auto a = A + aStride * y, b = B + bStride * y; + auto c = C + cStride * y; + for (int x = 0; x < widthC8; ++x) { + vst1q_f16(c + x * 8, vsubq_f16(vld1q_f16(a + x * 8), vld1q_f16(b + x * 8))); + } + } +} + +static void Arm82MNNPackForMatMul_B(float* destC, const float* sourceC, size_t h, size_t l, bool transpose) { + auto dest = (int16_t*)destC; + auto source = (int16_t*)sourceC; + int ePack, lPack, hPack; + Arm82MNNGetMatMulPackMode(&ePack, &lPack, &hPack); + auto hP = (int)h / hPack; + auto hR = (int)hP * hPack; + if (hR != h) { + ::memset(dest, 0, UP_DIV(h, hPack) * hPack * l * sizeof(FLOAT16)); + } + if (!transpose) { + for (int y = 0; y < hP; ++y) { + auto destY = dest + y * hPack * l; + auto sourceY = source + y * hPack; + for (int x = 0; x < l; ++x) { + ::memcpy(destY + hPack * x, sourceY + x * h, hPack * sizeof(FLOAT16)); 
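+                // Inferred from the indexing above: with transpose == false the source weight is
+                // (l, h) with h contiguous, so each memcpy lifts hPack consecutive output channels
+                // of one l-column into the (UP_DIV(h, hPack), l, hPack) block layout that
+                // MNNPackedMatMulFP16 consumes; the h % hPack tail is copied just below into the
+                // zero-initialised remainder block. hPack is 16 on aarch64 and 8 on armv7
+                // (see Arm82MNNGetMatMulPackMode).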
+ } + } + auto hRemain = h - hR; + if (hRemain > 0) { + auto destY = dest + hP * hPack * l; + auto sourceY = source + hP * hPack; + for (int x = 0; x < l; ++x) { + ::memcpy(destY + hPack * x, sourceY + x * h, hRemain * sizeof(FLOAT16)); + } + } + return; + } + for (int y = 0; y < h; ++y) { + for (int x = 0; x < l; ++x) { + dest[(y / hPack * l + x) * hPack + y % hPack] = source[y * l + x]; + } + } +} + +static void MNNScaleAndAddBiasFP16(FLOAT16* dst, const FLOAT16* src, const FLOAT16* bias, const FLOAT16* alpha, size_t planeNumber, + size_t biasNumber) { + for (int z = 0; z < biasNumber; ++z) { + FLOAT16* dstZ = dst + planeNumber * 8 * z; + const FLOAT16* srcZ = src + planeNumber * 8 * z; +#ifdef MNN_USE_NEON + auto biasZ = vld1q_f16(bias + 8 * z), alphaZ = vld1q_f16(alpha + 8 * z); +#else + auto biasZ = bias + 8 * z, alphaZ = alpha + 8 * z; +#endif + for (int p = 0; p < planeNumber; ++p) { + FLOAT16* dstX = dstZ + 8 * p; + const FLOAT16* srcX = srcZ + 8 * p; +#ifdef MNN_USE_NEON + auto res = vaddq_f16(vmulq_f16(vld1q_f16(srcX), alphaZ), biasZ); + vst1q_f16(dstX, res); +#else + for (int k = 0; k < 8; ++k) { + dstX[k] = srcX[k] * alphaZ[k] + biasZ[k]; + } +#endif + } + } +} + +static void MNNScaleAndAddBiasOutside(FLOAT16* dst, const FLOAT16* src, const FLOAT16* bias, const FLOAT16* alpha, size_t planeNumber, + size_t biasNumber) { + for (size_t p = 0; p < planeNumber; ++p) { + FLOAT16* dstPlane = dst + p * biasNumber; + const FLOAT16* srcPlane = src + p * biasNumber; + for (int z = 0; z < biasNumber; ++z) { + dstPlane[z] = srcPlane[z] * alpha[z] + bias[z]; + } + } +} + +static void MNNAddBiasFP16(FLOAT16* dst, const FLOAT16* bias, size_t planeNumber, size_t biasNumber) { + using Vec = MNN::Math::Vec; + for (int i = 0; i < biasNumber; ++i) { + auto b = Vec::load(bias + i * 8); + for (int j = 0; j < planeNumber; ++j) { + auto dstPtr = dst + (i * planeNumber + j) * 8; + Vec::save(dstPtr, Vec::load(dstPtr) + b); + } + } +} +static void MNNAddBiasReluFP16(FLOAT16* dst, const FLOAT16* bias, size_t planeNumber, size_t biasNumber) { + using Vec = MNN::Math::Vec; + Vec zero((FLOAT16)0); + for (int i = 0; i < biasNumber; ++i) { + auto b = Vec::load(bias + i * 8); + for (int j = 0; j < planeNumber; ++j) { + auto dstPtr = dst + (i * planeNumber + j) * 8; + auto result = Vec::max(Vec::load(dstPtr) + b, zero); + Vec::save(dstPtr, result); + } + } +} +static void MNNAddBiasRelu6FP16(FLOAT16* dst, const FLOAT16* bias, size_t planeNumber, size_t biasNumber) { + using Vec = MNN::Math::Vec; + Vec zero((FLOAT16)0), six((FLOAT16)6); + for (int i = 0; i < biasNumber; ++i) { + auto b = Vec::load(bias + i * 8); + for (int j = 0; j < planeNumber; ++j) { + auto dstPtr = dst + (i * planeNumber + j) * 8; + auto result = Vec::min(Vec::max(Vec::load(dstPtr) + b, zero), six); + Vec::save(dstPtr, result); + } + } +} + +static void MNNCopyC8WithStrideFP16(const FLOAT16* source, FLOAT16* dest, size_t srcStride, size_t dstStride, size_t count) { + using Vec = MNN::Math::Vec; + for (int i = 0; i < count; ++i) { + auto srcPtr = source + i * srcStride; + auto dstPtr = dest + i * dstStride; + Vec::save(dstPtr, Vec::load(srcPtr)); + } +} + +static void MNNAddC8WithStrideFP16(const FLOAT16* source, FLOAT16* dest, size_t srcStride, size_t dstStride, size_t count) { + using Vec = MNN::Math::Vec; + for (int i = 0; i < count; ++i) { + auto srcPtr = source + i * srcStride; + auto dstPtr = dest + i * dstStride; + auto value = Vec::load(dstPtr) + Vec::load(srcPtr); + Vec::save(dstPtr, value); + } +} + +static void 
MNNAxByClampBroadcastC8FP16(float* CF, const float* AF, const float* BF, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters) { + auto C = (FLOAT16*)CF; + auto A = (FLOAT16*)AF; + auto B = (FLOAT16*)BF; + using Vec = MNN::Math::Vec; + auto minF = Vec(parameters[2]); + auto maxF = Vec(parameters[3]); + auto beta = Vec(parameters[1]); + for (int y = 0; y < height; ++y) { + auto a = A + aStride * y; + auto b = B + 8 * y; + auto bv = Vec::load(b); + auto c = C + cStride * y; + for (int x = 0; x < width; ++x) { + auto av = Vec::load(a + 8 * x); + auto cv = av + bv * beta; + cv = Vec::min(cv, maxF); + cv = Vec::max(cv, minF); + Vec::save(c + 8 * x, cv); + } + } +} + +void ARM82MultiAndDestTransformCommon(FLOAT16 **cacheLine, const FLOAT16 *weight, FLOAT16 *dest, int cacheLineSize, int ow) { + constexpr int pack = 8; + int unit = ow / 2; + MNN_ASSERT(cacheLineSize >= 1); + for (int x = 0; x < unit; ++x) { + int offset = 4 * pack * x, i = 0; + Vec m0 = Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset); + Vec m1 = Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack * 1); + Vec m2 = Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2); + Vec m3 = Vec::load(weight + (i * 4 + 3) * pack) * Vec::load(cacheLine[i] + offset + pack * 3); + for (i = 1; i < cacheLineSize; ++i) { + m0 = m0 + Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset); + m1 = m1 + Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack * 1); + m2 = m2 + Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2); + m3 = m3 + Vec::load(weight + (i * 4 + 3) * pack) * Vec::load(cacheLine[i] + offset + pack * 3); + } + auto o0 = m0 + m1 + m2; + auto o1 = m1 - m2 + m3; + Vec::save(dest + (2 * x + 0) * pack, o0); + Vec::save(dest + (2 * x + 1) * pack, o1); + } + if (unit * 2 < ow) { + int offset = 4 * pack * unit, i = 0; + Vec m0 = Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset); + Vec m1 = Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack); + Vec m2 = Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2); + for (i = 1; i < cacheLineSize; ++i) { + m0 = m0 + Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset); + m1 = m1 + Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack); + m2 = m2 + Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2); + } + auto o0 = m0 + m1 + m2; + Vec::save(dest + 2 * unit * pack, o0); + } +} +// unit: winograd unit (output is w/2) +void ARM82SourceTransformCommon(const FLOAT16 *source, FLOAT16 *dest, int unit, int iw, int pad, int su, int eu) { + constexpr int pack = 8; // float16x8 + for (int x = 0; x < su; ++x) { + auto dstX = dest + 4 * pack * x; + auto sx = x * 2 - (int)pad; + auto ex = sx + 4; + auto clampSx = std::max(sx, 0); + auto clampEx = std::min(ex, (int)iw); + Vec v[4] = {0.0f, 0.0f, 0.0f, 0.0f}; + for (int i = clampSx; i < clampEx; ++i) { + v[i - sx] = Vec::load(source + pack * i); + } + auto m0 = v[0] - v[2]; + auto m1 = v[1] + v[2]; + auto m2 = v[2] - v[1]; + auto m3 = v[3] - v[1]; + Vec::save(dstX + pack * 0, m0); + Vec::save(dstX + pack * 1, m1); + Vec::save(dstX + pack * 2, m2); + Vec::save(dstX + pack * 3, m3); + } + MNNConvDwF23SourceTransUnitFP16(source + pack * (su * 2 - pad), dest + 4 * pack * su, eu - su); + for (int x = eu; x < unit; ++x) { + auto dstX = dest + 4 * 
pack * x; + auto sx = x * 2 - (int)pad; + auto ex = sx + 4; + auto clampSx = std::max(sx, 0); + auto clampEx = std::min(ex, (int)iw); + Vec v[4] = {0.0f, 0.0f, 0.0f, 0.0f}; + for (int i = clampSx; i < clampEx; ++i) { + v[i - sx] = Vec::load(source + pack * i); + } + auto m0 = v[0] - v[2]; + auto m1 = v[1] + v[2]; + auto m2 = v[2] - v[1]; + auto m3 = v[3] - v[1]; + Vec::save(dstX + pack * 0, m0); + Vec::save(dstX + pack * 1, m1); + Vec::save(dstX + pack * 2, m2); + Vec::save(dstX + pack * 3, m3); + } +} + +void ARM82StrassenMerge(FLOAT16* c11, FLOAT16* c12, FLOAT16* c21, FLOAT16* c22, FLOAT16* xAddr, + size_t cStride, size_t eSub, size_t hSub) { + const int pack = 8; + for (int y = 0; y < hSub; ++y) { + auto c11Y = c11 + y * cStride; + auto c12Y = c12 + y * cStride; + auto c22Y = c22 + y * cStride; + auto c21Y = c21 + y * cStride; + auto xY = xAddr + y * eSub * pack; + for (int x = 0; x < eSub; ++x) { + auto xv = vld1q_f16(xY + x * pack); + auto c21v = vld1q_f16(c21Y + x * pack); + auto c11v = vld1q_f16(c11Y + x * pack); + auto c22v = vld1q_f16(c22Y + x * pack); + auto c12v = vld1q_f16(c12Y + x * pack); + c12v = c12v + xv; + c21v = c12v + c21v; + c12v = c22v + c12v; + c22v = c22v + c21v; + c12v = c11v + c12v; + vst1q_f16(c12Y + x * pack, c12v); + vst1q_f16(c22Y + x * pack, c22v); + vst1q_f16(c21Y + x * pack, c21v); + } + } +} + +void MNNUnpackTransposeInt16C8(int16_t* dst, const int16_t* src, size_t area, size_t depth) { + if (1 == area) { + ::memcpy(dst, src, depth * sizeof(int16_t)); + return; + } + int c = (int)depth; + int cDiv4 = c / 8; + int cAlign = cDiv4 * 8; + if (cAlign == c) { + for (int hi = 0; hi < area; ++hi) { + auto srcHeight = src + hi * 8; + auto dstHeight = dst + hi * cDiv4 * 8; + for (int ci = 0; ci < cDiv4; ++ci) { + vst1q_s16(dstHeight + ci * 8, vld1q_s16(srcHeight + 8 * ci * area)); + } + } + return; + } + + for (int hi = 0; hi < area; ++hi) { + auto srcHeight = src + hi * 8; + auto dstHeight = dst + hi * c; + for (int ci = 0; ci < cDiv4; ++ci) { + vst1q_s16(dstHeight + ci * 8, vld1q_s16(srcHeight + 8 * ci * area)); + } + } + + int cReamin = c - cAlign; + auto srcAlign = src + area * cAlign; + auto dstAlign = dst + cAlign; + + for (int hi = 0; hi < area; ++hi) { + auto srcHeight = srcAlign + hi * 8; + auto dstHeight = dstAlign + hi * c; + + for (int ci = 0; ci < cReamin; ++ci) { + dstHeight[ci] = srcHeight[ci]; + } + } +} + +void MNNPackTransposeInt16C8(int16_t* dst, const int16_t* src, size_t area, size_t depth) { + if (depth == 8) { + ::memcpy(dst, src, area * depth * sizeof(int16_t)); + return; + } + int c = (int)depth; + int cDiv4 = c / 8; + int cAlign = cDiv4 * 8; + for (int hi = 0; hi < area; ++hi) { + auto srcHeight = (src + hi * c); + auto dstHeight = (dst + hi * 8); + for (int ci = 0; ci < cDiv4; ++ci) { + vst1q_s16(dstHeight + ci * area * 8, vld1q_s16(srcHeight + 8 * ci)); + } + } + + if (cAlign == c) { + return; + } + + int cReamin = c - cAlign; + auto srcAlign = src + cAlign; + auto dstAlign = dst + area * cAlign; + + for (int hi = 0; hi < area; ++hi) { + auto srcHeight = srcAlign + hi * c; + auto dstHeight = dstAlign + hi * 8; + for (int i = 0; i < 8; ++i) { + dstHeight[i] = 0; + } + for (int ci = 0; ci < cReamin; ++ci) { + dstHeight[ci] = srcHeight[ci]; + } + } +} + +static void MNNConvRunForUnitDepthWiseFP16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, + size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) { + int fx, fy; + Vec dstValue(0.0f); + auto src_z = (const FLOAT16*)src; + auto weight_z = (const 
FLOAT16*)weight; + for (fy = 0; fy < fh; ++fy) { + auto src_y = src_z + fy * dilateY_step; + auto weight_y = weight_z + fy * weight_y_step; + for (fx = 0; fx < fw; ++fx) { + auto weight_x = weight_y + 8 * fx; + auto src_x = src_y + fx * dilateX_step; + dstValue = dstValue + Vec::load(src_x) * Vec::load(weight_x); + } + } + Vec::save((FLOAT16*)dst, dstValue); +} + +static void _MNNDeconvRunForUnitDepthWise(const FLOAT16* dst, FLOAT16* src, const FLOAT16* weight, size_t fw, size_t fh, + size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) { + int fx, fy; + auto src_z = src; + auto weight_z = weight; + Vec dstV = Vec::load(dst); + for (fy = 0; fy < fh; ++fy) { + auto src_y = src_z + fy * dilateY_step; + auto weight_y = weight_z + fy * weight_y_step; + for (fx = 0; fx < fw; ++fx) { + Vec weight_x = Vec::load(weight_y + 8 * fx); + Vec src_x = Vec::load(src_y + fx * dilateX_step); + Vec::save(src_y + fx * dilateX_step, src_x + weight_x * dstV); + } + } +} +static void _MNNDeconvRunForLineDepthwise(const FLOAT16* dst, FLOAT16* src, const FLOAT16* weight, size_t width, size_t src_w_setup, + size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step) { + int dx; + for (dx = 0; dx < width; ++dx) { + auto dst_x = dst + dx * 8; + auto src_dx = src + src_w_setup * dx; + _MNNDeconvRunForUnitDepthWise(dst_x, src_dx, weight, fw, fh, fw * 8, dilateX_step, dilateY_step); + } +} + +static CoreFunctions* gInstance = nullptr; +bool Arm82Functions::init() { +#define FUNC_PTR_ASSIGN(dst, src) dst = (decltype(dst))src + gInstance = new CoreFunctions; + FUNC_PTR_ASSIGN(gInstance->MNNFp32ToLowp, MNNQuantizeFP16); + FUNC_PTR_ASSIGN(gInstance->MNNLowpToFp32, MNNDequantizeFP16); + gInstance->bytes = 2; + + // Packed + gInstance->pack = 8; + FUNC_PTR_ASSIGN(gInstance->MNNPackCUnit, MNNPackC8FP16); + FUNC_PTR_ASSIGN(gInstance->MNNUnpackCUnit, MNNUnPackC8FP16); + FUNC_PTR_ASSIGN(gInstance->MNNPackCUnitTranspose, MNNPackTransposeInt16C8); + FUNC_PTR_ASSIGN(gInstance->MNNUnpackCUnitTranspose, MNNUnpackTransposeInt16C8); + FUNC_PTR_ASSIGN(gInstance->MNNConvRunForUnitDepthWise, MNNConvRunForUnitDepthWiseFP16); + FUNC_PTR_ASSIGN(gInstance->MNNConvRunForLineDepthwise, MNNConvRunForLineDepthwiseFP16); + FUNC_PTR_ASSIGN(gInstance->MNNAxByClampBroadcastUnit, MNNAxByClampBroadcastC8FP16); + FUNC_PTR_ASSIGN(gInstance->MNNConvDwF23MulTransUnit, MNNConvDwF23MulTransUnitFP16); + FUNC_PTR_ASSIGN(gInstance->MNNSourceTransformCommonF23, ARM82SourceTransformCommon); + FUNC_PTR_ASSIGN(gInstance->MNNMultiAndDestTransformCommon23, ARM82MultiAndDestTransformCommon); + FUNC_PTR_ASSIGN(gInstance->MNNMatrixSub, MNNMatrixSubFP16); + FUNC_PTR_ASSIGN(gInstance->MNNMatrixAdd, MNNMatrixAddFP16); + FUNC_PTR_ASSIGN(gInstance->MNNStrassenMergeCFunction, ARM82StrassenMerge); + gInstance->penalty = 2.0f; + FUNC_PTR_ASSIGN(gInstance->MNNScaleAndAddBias, MNNScaleAndAddBiasFP16); + FUNC_PTR_ASSIGN(gInstance->MNNCopyC4WithStride, MNNCopyC8WithStrideFP16); + FUNC_PTR_ASSIGN(gInstance->MNNAddC4WithStride, MNNAddC8WithStrideFP16); + + // MatMul + FUNC_PTR_ASSIGN(gInstance->MNNPackedMatMul, MNNPackedMatMulFP16); + FUNC_PTR_ASSIGN(gInstance->MNNPackedMatMulRemain, MNNPackedMatMulRemainFP16); + FUNC_PTR_ASSIGN(gInstance->MNNPackC4ForMatMul_A, Arm82MNNPackForMatMul_A); + FUNC_PTR_ASSIGN(gInstance->MNNGetMatMulPackMode, Arm82MNNGetMatMulPackMode); + FUNC_PTR_ASSIGN(gInstance->MNNPackForMatMul_B, Arm82MNNPackForMatMul_B); + + FUNC_PTR_ASSIGN(gInstance->chooseWinoSourceTransform, Arm82WinogradFunction::chooseSourceTransform); + 
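+    // Like the entries above, the Winograd dest-transform chooser assigned below is the FLOAT16
+    // counterpart of the stock FP32 CoreFunctions entry. The whole table (pack = 8, bytes = 2)
+    // is exposed through Arm82Functions::get(), so any backend code that dispatches through this
+    // CoreFunctions instance runs the fp16 kernels.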
FUNC_PTR_ASSIGN(gInstance->chooseWinoDestTransform, Arm82WinogradFunction::chooseDestTransform); + + gInstance->MNNDeconvRunForLineDepthwise = (decltype(gInstance->MNNDeconvRunForLineDepthwise))_MNNDeconvRunForLineDepthwise; + gInstance->MNNDeconvRunForUnitDepthWise = (decltype(gInstance->MNNDeconvRunForUnitDepthWise))_MNNDeconvRunForUnitDepthWise; + return true; +} + +CoreFunctions* Arm82Functions::get() { + return gInstance; +} +}; +#endif diff --git a/source/backend/arm82/Arm82Functions.hpp b/source/backend/arm82/Arm82Functions.hpp new file mode 100644 index 00000000..3282af97 --- /dev/null +++ b/source/backend/arm82/Arm82Functions.hpp @@ -0,0 +1,20 @@ +#if defined(__ANDROID__) || defined(__aarch64__) + +#ifndef Arm82Functions_hpp +#define Arm82Functions_hpp +#include +#include +#include +#include "core/Macro.h" +#include "backend/cpu/CPUBackend.hpp" +namespace MNN { +class Arm82Functions { +public: + static bool init(); + static CoreFunctions* get(); +}; + +}; + +#endif // Arm82Functions_hpp +#endif diff --git a/source/backend/arm82/Arm82InstanceNorm.cpp b/source/backend/arm82/Arm82InstanceNorm.cpp new file mode 100644 index 00000000..167efc4c --- /dev/null +++ b/source/backend/arm82/Arm82InstanceNorm.cpp @@ -0,0 +1,107 @@ +// +// Arm82InstanceNorm.hpp +// MNN +// +// Created by MNN on 2019/02/28. +// Copyright © 2018, Alibaba Group Holding Limited +// +#if defined(__ANDROID__) || defined(__aarch64__) + +#include "Arm82Backend.hpp" +#include "Arm82OptFunc.hpp" +#include "Arm82InstanceNorm.hpp" +#include "MNN_generated.h" +#include "core/Concurrency.h" +#include +#include +#include "core/Macro.h" +#include "core/TensorUtils.hpp" + +#ifdef MNN_USE_NEON +#include +#endif + +namespace MNN { + +Arm82InstanceNorm::Arm82InstanceNorm(Backend* backend, const MNN::Op* op) : Execution(backend) { + auto normParam = op->main_as_BatchNorm(); + const int channels = normParam->channels(); + mEpsilon = normParam->epsilon(); + mScale.reset(ALIGN_UP8(channels)); + mScale.clear(); + if (normParam->slopeData() && normParam->slopeData()->data()) { + MNNSlowCopy(mScale.get(), normParam->slopeData()->data(), channels); + } + + mBias.reset(ALIGN_UP8(channels)); + mBias.clear(); + if (normParam->biasData() && normParam->biasData()->data()) { + MNNSlowCopy(mBias.get(), normParam->biasData()->data(), channels); + } +} + +ErrorCode Arm82InstanceNorm::onExecute(const std::vector& inputs, const std::vector& outputs) { + MNN_ASSERT(3 == inputs.size()); + MNN_ASSERT(1 == outputs.size()); + + auto input = inputs[0], mean = inputs[1], variance = inputs[2], output = outputs[0]; + const int batch = input->batch(), imageSize = input->stride(1); + auto scalePtr = mScale.get(), biasPtr = mBias.get(); + const int threadNum = ((Arm82Backend*)backend())->numberThread(); + const int channelBlock = UP_DIV(input->channel(), 8); + + for (int b = 0; b < batch; ++b) { + auto inputPtr = input->host() + b * ARM82TensorStrideHelper(input, 0); + auto meanPtr = mean->host() + b * ARM82TensorStrideHelper(mean, 0); + auto variancePtr = variance->host() + b * ARM82TensorStrideHelper(variance, 0); + auto outputPtr = output->host() + b * ARM82TensorStrideHelper(output, 0); + + MNN_CONCURRENCY_BEGIN(tId, threadNum) { + const int step = UP_DIV(channelBlock, threadNum) * 8, start = tId * step, end = ALIMIN(start + step, channelBlock); + for (int c = start; c < end; c += 8) { + auto inputPtrZ = inputPtr + c * imageSize; + auto outputPtrZ = outputPtr + c * imageSize; +#ifdef MNN_USE_NEON + float16x8_t meanVec = vld1q_f16(meanPtr + c), varVec = 
vld1q_f16(variancePtr + c); + float16x8_t scaleVec = vld1q_f16(scalePtr + c), biasVec = vld1q_f16(biasPtr + c); + float16x8_t epsVec = vdupq_n_f16(mEpsilon), rsqrtVec = vrsqrteq_f16(varVec + epsVec); + + float16x8_t gamma = vmulq_f16(scaleVec, rsqrtVec); + float16x8_t beta = vsubq_f16(biasVec, vmulq_f16(meanVec, gamma)); + for (int i = 0; i < imageSize; ++i) { + float16x8_t in = vld1q_f16(inputPtr + i * 8); + vst1q_f16(outputPtrZ + i * 8, vaddq_f16(vmulq_f16(in, gamma), beta)); + } +#else + FLOAT16 gamma[8], beta[8]; + for (int k = 0; k < 8; ++k) { + int index = c + k; + gamma[k] = scalePtr[index] / sqrt(variancePtr[index] + mEpsilon); + beta[k] = biasPtr[index] - gamma[k] * meanPtr[index]; + } + for (int i = 0; i < imageSize; ++i) { + for (int k = 0; k < 8; ++k) { + outputPtrZ[i * 8 + k] = inputPtrZ[i * 8 + k] * gamma[k] + beta[k]; + } + } +#endif + } + } + MNN_CONCURRENCY_END(); + } + + return NO_ERROR; +} + +class Arm82InstanceNormCreator : public Arm82Backend::Arm82Creator { +public: + virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, + const MNN::Op* op, Backend* backend) const override { + return new Arm82InstanceNorm(backend, op); + } +}; + +REGISTER_ARM82_OP_CREATOR(OpType_InstanceNorm, Arm82InstanceNormCreator); + +} // namespace MNN +#endif diff --git a/source/backend/arm82/Arm82InstanceNorm.hpp b/source/backend/arm82/Arm82InstanceNorm.hpp new file mode 100644 index 00000000..4fdf4f26 --- /dev/null +++ b/source/backend/arm82/Arm82InstanceNorm.hpp @@ -0,0 +1,33 @@ +// +// Arm82InstanceNorm.hpp +// MNN +// +// Created by MNN on 2019/02/28. +// Copyright © 2018, Alibaba Group Holding Limited +// +#if defined(__ANDROID__) || defined(__aarch64__) + +#ifndef Arm82InstanceNorm_hpp +#define Arm82InstanceNorm_hpp + +#include "Arm82Backend.hpp" +#include "core/AutoStorage.h" +#include "core/Execution.hpp" +#include "MNN_generated.h" + +namespace MNN { +class Arm82InstanceNorm : public Execution { +public: + Arm82InstanceNorm(Backend *backend, const MNN::Op *op); + virtual ~Arm82InstanceNorm() = default; + virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; + +private: + AutoStorage mScale; + AutoStorage mBias; + FLOAT16 mEpsilon; +}; +} // namespace MNN + +#endif /* Arm82InstanceNorm_hpp */ +#endif diff --git a/source/backend/arm82/Arm82Interp.cpp b/source/backend/arm82/Arm82Interp.cpp index f642d42d..ddb8530e 100644 --- a/source/backend/arm82/Arm82Interp.cpp +++ b/source/backend/arm82/Arm82Interp.cpp @@ -5,8 +5,9 @@ // Created by MNN on 2020/04/28. // Copyright © 2018, Alibaba Group Holding Limited // -#ifdef __aarch64__ -#include "backend/arm82/Arm82Interp.hpp" +#if defined(__ANDROID__) || defined(__aarch64__) + +#include "Arm82Interp.hpp" #include #include "core/Concurrency.h" #include "core/Macro.h" diff --git a/source/backend/arm82/Arm82Interp.hpp b/source/backend/arm82/Arm82Interp.hpp index d8ab2b2f..ef86071f 100644 --- a/source/backend/arm82/Arm82Interp.hpp +++ b/source/backend/arm82/Arm82Interp.hpp @@ -5,11 +5,12 @@ // Created by MNN on 2020/04/28. 
// Copyright © 2018, Alibaba Group Holding Limited // +#if defined(__ANDROID__) || defined(__aarch64__) #ifndef CPUInterp_hpp #define CPUInterp_hpp -#include "backend/arm82/Arm82Backend.hpp" +#include "Arm82Backend.hpp" #include "core/AutoStorage.h" #include "core/Execution.hpp" @@ -38,3 +39,4 @@ private: } // namespace MNN #endif +#endif diff --git a/source/backend/arm82/Arm82Moments.cpp b/source/backend/arm82/Arm82Moments.cpp new file mode 100644 index 00000000..a9c8c644 --- /dev/null +++ b/source/backend/arm82/Arm82Moments.cpp @@ -0,0 +1,120 @@ +// +// Arm82Moments.cpp +// MNN +// +// Created by MNN on 2019/02/28. +// Copyright © 2018, Alibaba Group Holding Limited +// +#if defined(__ANDROID__) || defined(__aarch64__) + +#include "Arm82Moments.hpp" +#include "Arm82Backend.hpp" +#include "Arm82Vec.hpp" +#include "core/Concurrency.h" +#include +#include "core/Macro.h" +#include "core/TensorUtils.hpp" + +#ifdef MNN_USE_NEON +#include +#endif + +using Vec = MNN::Math::Vec; +namespace MNN { + +Arm82Moments::Arm82Moments(Backend *backend, const MNN::Op *op) : Execution(backend) { + auto momentsParam = op->main_as_MomentsParam(); + if (momentsParam->dim()) { + for (int i = 0; i < momentsParam->dim()->size(); ++i) { + mAxis.push_back(momentsParam->dim()->data()[i]); + } + } + mKeepDims = momentsParam->keepDims(); + MNN_ASSERT(DataType_DT_FLOAT == momentsParam->dType()); +} + +ErrorCode Arm82Moments::onResize(const std::vector &inputs, const std::vector &outputs) { + return NO_ERROR; +} + +void Arm82Moments::calculateMean(const FLOAT16 *src, FLOAT16 *mean, int channelBlock, int planeNumber) { + const int numberThread = ((Arm82Backend*)backend())->numberThread(); + MNN_CONCURRENCY_BEGIN(tId, numberThread) { + int step = UP_DIV(channelBlock, numberThread), start = tId * step, end = ALIMIN(start + step, channelBlock); + for (int z = start; z < end; ++z) { + const FLOAT16* srcZ = src + z * planeNumber * 8; + FLOAT16* meanZ = mean + z * 8; + + Vec sum(0); + for (int i = 0; i < planeNumber; ++i) { + sum = sum + Vec::load(srcZ + i * 8); + } + Vec result = sum / (float)planeNumber; + Vec::save(meanZ, result); + } + + } MNN_CONCURRENCY_END(); +} + +void Arm82Moments::calculateVariance(const FLOAT16 *src, const FLOAT16 *mean, FLOAT16* var, int channelBlock, int planeNumber) { + const int numberThread = ((Arm82Backend*)backend())->numberThread(); + MNN_CONCURRENCY_BEGIN(tId, numberThread) { + int step = UP_DIV(channelBlock, numberThread), start = tId * step, end = ALIMIN(start + step, channelBlock); + for (int z = start; z < end; ++z) { + const FLOAT16* srcZ = src + z * planeNumber * 8, *meanZ = mean + z * 8; + FLOAT16* varZ = var + z * 8; + + Vec sum(0), meanVal = Vec::load(meanZ); + for (int i = 0; i < planeNumber; ++i) { + Vec diff = Vec::load(srcZ + i * 8) - meanVal; + sum = sum + diff * diff; + } + Vec result = sum / (float)planeNumber; + Vec::save(varZ, result); + } + + } MNN_CONCURRENCY_END(); +} + +ErrorCode Arm82Moments::onExecute(const std::vector &inputs, const std::vector &outputs) { + MNN_ASSERT(1 == inputs.size()); + MNN_ASSERT(2 == outputs.size()); + auto input = inputs[0], mean = outputs[0], variance = outputs[1]; + + // the layout of Moments is NC4HW4, now only support for calculating Moments along height and width + MNN_ASSERT(MNN_DATA_FORMAT_NC4HW4 == TensorUtils::getDescribe(input)->dimensionFormat); + MNN_ASSERT(mKeepDims); + MNN_ASSERT(mAxis.size() == 2 && mAxis[0] == 2 && mAxis[1] == 3); + + const int batch = input->batch(), channelBlock = UP_DIV(mean->channel(), 8); + const int 
inBatchStride = ARM82TensorStrideHelper(input, 0), outBatchStride = ARM82TensorStrideHelper(mean, 0); + const int planeNumber = ARM82TensorStrideHelper(input, 1); + // mean + for (int b = 0; b < batch; ++b) { + const FLOAT16* srcPtr = input->host() + b * inBatchStride; + FLOAT16* meanPtr = mean->host() + b * outBatchStride; + calculateMean(srcPtr, meanPtr, channelBlock, planeNumber); + } + // variance + for (int b = 0; b < batch; ++b) { + const FLOAT16* srcPtr = input->host() + b * inBatchStride; + const FLOAT16* meanPtr = mean->host() + b * outBatchStride; + FLOAT16* variancePtr = variance->host() + b * outBatchStride; + calculateVariance(srcPtr, meanPtr, variancePtr, channelBlock, planeNumber); + } + + return NO_ERROR; +} + +class Arm82MomentsCreator : public Arm82Backend::Arm82Creator { +public: + virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, + const MNN::Op *op, Backend *backend) const override { + return new Arm82Moments(backend, op); + } +}; + +REGISTER_ARM82_OP_CREATOR(OpType_Moments, Arm82MomentsCreator); + +} // namespace MNN +#endif diff --git a/source/backend/arm82/Arm82Moments.hpp b/source/backend/arm82/Arm82Moments.hpp new file mode 100644 index 00000000..56c37ea2 --- /dev/null +++ b/source/backend/arm82/Arm82Moments.hpp @@ -0,0 +1,35 @@ +// +// Arm82Moments.hpp +// MNN +// +// Created by MNN on 2019/02/28. +// Copyright © 2018, Alibaba Group Holding Limited +// +#if defined(__ANDROID__) || defined(__aarch64__) + +#ifndef Arm82Moments_hpp +#define Arm82Moments_hpp + +#include "Arm82Backend.hpp" +#include "core/Execution.hpp" + +namespace MNN { + +class Arm82Moments : public Execution { +public: + Arm82Moments(Backend* backend, const MNN::Op* op); + virtual ~Arm82Moments() = default; + virtual ErrorCode onExecute(const std::vector& inputs, const std::vector& outputs) override; + virtual ErrorCode onResize(const std::vector& inputs, const std::vector& outputs) override; + +private: + void calculateMean(const FLOAT16 *src, FLOAT16 *mean, int channelBlock, int planeNumber); + void calculateVariance(const FLOAT16 *src, const FLOAT16 *mean, FLOAT16* var, int channelBlock, int planeNumber); + std::vector mAxis; + bool mKeepDims; +}; + +} // namespace MNN + +#endif /* Arm82Moments_hpp */ +#endif diff --git a/source/backend/arm82/Arm82OpRegister.cpp b/source/backend/arm82/Arm82OpRegister.cpp index 2fd6beee..1e77fcaa 100644 --- a/source/backend/arm82/Arm82OpRegister.cpp +++ b/source/backend/arm82/Arm82OpRegister.cpp @@ -1,26 +1,28 @@ // This file is generated by Shell for ops register namespace MNN { -extern void ___OpType_ConvolutionDepthwise__Arm82ConvolutionDepthwiseCreator__(); +extern void ___OpType_Moments__Arm82MomentsCreator__(); extern void ___OpType_Raster__Arm82RasterFactory__(); extern void ___OpType_Pooling__Arm82PoolingCreator__(); +extern void ___OpType_InstanceNorm__Arm82InstanceNormCreator__(); extern void ___OpType_Eltwise__Arm82EltwiseCreator__(); extern void ___OpType_ReLU__Arm82ReluCreator__(); extern void ___OpType_PReLU__Arm82ReluCreator__(); extern void ___OpType_BinaryOp__Arm82BinaryCreator__(); extern void ___OpType_Interp__Arm82InterpCreator__(); -extern void ___OpType_Convolution__Arm82ConvolutionCreator__(); +extern void ___OpType_UnaryOp__Arm82UnaryCreator__(); void registerArm82Ops() { -#ifdef __aarch64__ -___OpType_ConvolutionDepthwise__Arm82ConvolutionDepthwiseCreator__(); +#if defined(__ANDROID__) || defined(__aarch64__) +___OpType_Moments__Arm82MomentsCreator__(); ___OpType_Raster__Arm82RasterFactory__(); 
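+// This list is regenerated by Arm82Register.py; each ___OpType_*__Creator__() call runs the
+// registration body that REGISTER_ARM82_OP_CREATOR emits in the corresponding source file,
+// adding that creator to the Arm82 op table (inferred from the registration pattern, not
+// spelled out in this generated file).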
___OpType_Pooling__Arm82PoolingCreator__(); +___OpType_InstanceNorm__Arm82InstanceNormCreator__(); ___OpType_Eltwise__Arm82EltwiseCreator__(); ___OpType_ReLU__Arm82ReluCreator__(); ___OpType_PReLU__Arm82ReluCreator__(); ___OpType_BinaryOp__Arm82BinaryCreator__(); ___OpType_Interp__Arm82InterpCreator__(); -___OpType_Convolution__Arm82ConvolutionCreator__(); +___OpType_UnaryOp__Arm82UnaryCreator__(); #endif } } diff --git a/source/backend/arm82/Arm82OptFunc.cpp b/source/backend/arm82/Arm82OptFunc.cpp index 2db3e445..08ddef5b 100644 --- a/source/backend/arm82/Arm82OptFunc.cpp +++ b/source/backend/arm82/Arm82OptFunc.cpp @@ -5,27 +5,71 @@ // Created by MNN on 2019/02/06. // Copyright © 2018, Alibaba Group Holding Limited // -#ifdef __aarch64__ -#include "backend/arm82/Arm82OptFunc.hpp" +#if defined(__ANDROID__) || defined(__aarch64__) + +#include "Arm82OptFunc.hpp" +#include "Arm82Vec.hpp" #include "core/Macro.h" #include "half.hpp" + +#ifdef MNN_USE_NEON #include -void MNNQuantizeFP16(FLOAT16* dst, const float* src, int size) { - int sizeDiv4 = size / 4; - int remain = size - sizeDiv4 * 4; +#endif - if (sizeDiv4 > 0) { - MNNQuantizeFP16_UNIT4(dst, src, sizeDiv4); +extern "C" { +void MNNExpFP16(FLOAT16* dst, const FLOAT16* src, const FLOAT16* params, size_t blockCount); + +void MNNQuantizeFP16_UNIT4(int16_t* dst, const float* src, int size); + +} + +void Arm82MNNExp(FLOAT16* dst, const FLOAT16* src, size_t dataSize) { + int blockCount = dataSize / 16; + if (blockCount > 0) { + static FLOAT16 params[] = { + (FLOAT16)log(2.0f), (FLOAT16)(1.0f / log(2.0f)), 1.0f, 1.0f, 0.5f, 1.0f / 6.0f, 1.0f / 24.0f, 1.0f / 120.0f}; + MNNExpFP16(dst, src, params, blockCount); } - - if (remain > 0) { - for (int i = sizeDiv4 * 4; i < size; ++i) { - dst[i] = half_float::half(src[i]); - } + FLOAT16 xLimit = 11, expStep = log(2.0f), expStep_r = 1.0f / expStep; + for (int i = blockCount * 16; i < dataSize; ++i) { + auto x = -src[i]; + x = ALIMAX(x, -xLimit); + x = ALIMIN(x, xLimit); + int div = x * expStep_r, expBasicRaw = (div + 15) << 10; + FLOAT16 t = x - div * expStep, expBasic = *(FLOAT16*)(&expBasicRaw); + FLOAT16 expRemain = ((((1.0f / 120 * t + 1.0f / 24) * t + 1.0f / 6) * t + 0.5f) * t + 1.0f) * t + 1.0f; + dst[i] = (FLOAT16)(expBasic * expRemain); } } -void MNNDequantizeFP16(float* dst, const int16_t* srcint, int size) { +void Arm82MNNGetMatMulPackMode(int* eP, int *lP, int* hP) { +#ifdef __aarch64__ + *hP = 16; +#else + *hP = 8; +#endif + *eP = 12; + *lP = 1; +} + +void MNNQuantizeFP16(const float* src, int16_t* dst, size_t size) { + int sizeDiv4 = size / 4; + int remain = size - sizeDiv4 * 4; + if (sizeDiv4 > 0) { + MNNQuantizeFP16_UNIT4(dst, src, sizeDiv4); + src += sizeDiv4 * 4; + dst += sizeDiv4 * 4; + } + if (remain > 0) { + float tempSrc[4]; + int16_t tempDst[4]; + ::memcpy(tempSrc, src, remain * sizeof(float)); + MNNQuantizeFP16_UNIT4(tempDst, tempSrc, 1); + ::memcpy(dst, tempDst, remain * sizeof(int16_t)); + } +} + +void MNNDequantizeFP16(const int16_t* srcint, float* dst, size_t size) { auto src = (const FLOAT16*)srcint; int sizeDiv4 = size / 4; int remain = size - sizeDiv4 * 4; @@ -47,10 +91,18 @@ void MNNDequantizeFP16(float* dst, const int16_t* srcint, int size) { } } -void MNNNC4HW4TONC8HW8(uint16_t* dst, const float* source, size_t plane, size_t channel) { +void MNNPackC8FP16(FLOAT16* dest, const FLOAT16* source, size_t plane, size_t channel) { + MNNPackUNIT(dest, source, plane, channel); +} + +void MNNUnPackC8FP16(FLOAT16* dest, const FLOAT16* source, size_t plane, size_t channel) { + 
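+    // Unpacks the NC8HW8 block layout back into a planar, channel-major buffer by delegating to
+    // the generic MNNUnpackUNIT template declared in Arm82OptFunc.hpp (presumably the FLOAT16,
+    // UNIT = 8 instantiation); MNNPackC8FP16 above is the inverse. Arm82Functions::init()
+    // installs the pair as MNNPackCUnit / MNNUnpackCUnit in the fp16 CoreFunctions table.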
MNNUnpackUNIT(dest, source, plane, channel); +} + +void MNNNC4HW4TONC8HW8(FLOAT16* dst, const float* source, size_t plane, size_t channel) { const int c4 = UP_DIV(channel, 4); const int c8 = UP_DIV(channel, 8); - memset(dst, 0, plane * c8 * 8 * sizeof(uint16_t)); + memset(dst, 0, plane * c8 * 8 * sizeof(FLOAT16)); #if defined(MNN_USE_NEON) && defined(__aarch64__) auto dest = (float16_t*)dst; #else @@ -78,7 +130,7 @@ void MNNNC4HW4TONC8HW8(uint16_t* dst, const float* source, size_t plane, size_t } } -void MNNNC8HW8TONC4HW4(float* dest, const uint16_t* src, size_t plane, size_t channel) { +void MNNNC8HW8TONC4HW4(float* dest, const FLOAT16* src, size_t plane, size_t channel) { const int c4 = UP_DIV(channel, 4); #if defined(MNN_USE_NEON) && defined(__aarch64__) auto source = (float16_t*)src; @@ -106,7 +158,7 @@ void MNNNC8HW8TONC4HW4(float* dest, const uint16_t* src, size_t plane, size_t ch } } -void MNNNC8HW8TONHWC(float* dest, const uint16_t* src, size_t plane, size_t channel) { +void MNNNC8HW8TONHWC(float* dest, const FLOAT16* src, size_t plane, size_t channel) { int c = (int)channel; int cDiv8 = c / 8; int cAlign = cDiv8 * 8; @@ -115,32 +167,28 @@ void MNNNC8HW8TONHWC(float* dest, const uint16_t* src, size_t plane, size_t chan #else auto source = src; #endif - for (int hi = 0; hi < plane; ++hi) { const auto srcHeight = source + hi * 8; float* dstHeight = dest + hi * c; for (int ci = 0; ci < cDiv8; ++ci) { -#ifdef MNN_USE_NEON +#if defined(MNN_USE_NEON) && defined(__aarch64__) float16x8_t a = vld1q_f16(srcHeight + 8 * ci * plane); vst1q_f32(dstHeight + 8 * ci, vcvt_high_f32_f16(a)); #else half_float::half dataHalf[8]; - memcpy(dataHalf, srcHeight + 8 * ci * plane, 8 * sizeof(uint16_t)); + memcpy(dataHalf, srcHeight + 8 * ci * plane, 8 * sizeof(FLOAT16)); for (int i = 0; i < 8; ++i) { dstHeight[ci * 8 + i] = float(dataHalf[i]); } #endif } } - if (cAlign == c) { return; } - int cReamin = c - cAlign; const auto srcAlign = reinterpret_cast(source + plane * cAlign); auto dstAlign = dest + cAlign; - for (int hi = 0; hi < plane; ++hi) { const auto srcHeight = srcAlign + hi * 8; float* dstHeight = dstAlign + hi * c; @@ -150,23 +198,4 @@ void MNNNC8HW8TONHWC(float* dest, const uint16_t* src, size_t plane, size_t chan } } } - -void MNNNCHWTONC8HW8(uint16_t* dest, const float* source, size_t plane, size_t channel) { - auto halfDest = reinterpret_cast(dest); - MNNPackUNIT(halfDest, source, plane, channel); -} - -void MNNNC8HW8TONCHW(float* dest, const uint16_t* source, size_t plane, size_t channel) { - auto halfSrc = reinterpret_cast(source); - MNNUnpackUNIT(dest, halfSrc, plane, channel); -} - -void MNNNCHWTONC8HW8_NO_TYPE(uint16_t* dest, const uint16_t* source, size_t plane, size_t channel) { - MNNPackUNIT(dest, source, plane, channel); -} - -void MNNNC8HW8TONCHW_NO_TYPE(uint16_t* dest, const uint16_t* source, size_t plane, size_t channel) { - MNNUnpackUNIT(dest, source, plane, channel); -} - #endif diff --git a/source/backend/arm82/Arm82OptFunc.hpp b/source/backend/arm82/Arm82OptFunc.hpp index a69b7dcf..6cbd5c43 100644 --- a/source/backend/arm82/Arm82OptFunc.hpp +++ b/source/backend/arm82/Arm82OptFunc.hpp @@ -5,116 +5,61 @@ // Created by MNN on 2019/02/06. 
// Copyright © 2018, Alibaba Group Holding Limited // -#ifdef __aarch64__ +#if defined(__ANDROID__) || defined(__aarch64__) + #ifndef Arm82OptFunc_hpp #define Arm82OptFunc_hpp -#include "backend/arm82/Arm82Backend.hpp" +#include "Arm82Backend.hpp" #include "core/Macro.h" -#define DST_XUNIT 8 - -#ifdef __cplusplus -extern "C" { -#endif - -void MNNGemmFP16C8_UNIT(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight, const FLOAT16* bias, size_t src_loop, - size_t dst_step, size_t dst_loop, size_t relu, size_t relu6, size_t realDstCount); - -void MNNShuffleChannelC8(FLOAT16* dst, const FLOAT16* src, size_t size, size_t halfFlag); -void MNNQuantizeFP16_UNIT4(FLOAT16* dst, const float* src, int size); -void MNNDequantizeFP16(float* dst, const int16_t* src, int size); - -#ifdef __cplusplus -} -#endif - -void MNNQuantizeFP16(FLOAT16* dst, const float* src, int size); - +void Arm82MNNGetMatMulPackMode(int* eP, int *lP, int* hP); +void Arm82MNNExp(FLOAT16* dst, const FLOAT16* src, size_t dataSize); +void MNNQuantizeFP16(const float* src, int16_t* dst, size_t size); +void MNNDequantizeFP16(const int16_t* src, float* dst, size_t size); +void MNNPackC8FP16(FLOAT16* dest, const FLOAT16* source, size_t area, size_t depth); +void MNNUnPackC8FP16(FLOAT16* dest, const FLOAT16* source, size_t area, size_t depth); // nc4hw4 to nc8hw8(aka fp32 -> fp16), convete dataformat and data type -void MNNNC4HW4TONC8HW8(uint16_t* dest, const float* source, size_t plane, size_t channel); +void MNNNC4HW4TONC8HW8(FLOAT16* dest, const float* source, size_t plane, size_t channel); // nc8hw8 to nc4hw4(aka fp16 -> fp32) -void MNNNC8HW8TONC4HW4(float* dest, const uint16_t* source, size_t plane, size_t channel); -// nchw to nc8hw8(aka fp32 -> fp16) -void MNNNCHWTONC8HW8(uint16_t* dest, const float* source, size_t plane, size_t channel); -// nc8hw8 to nchw(aka fp16 -> fp32) -void MNNNC8HW8TONCHW(float* dest, const uint16_t* source, size_t plane, size_t channel); - -void MNNNC8HW8TONHWC(float* dest, const uint16_t* src, size_t plane, size_t channel); - -void MNNNCHWTONC8HW8_NO_TYPE(uint16_t* dest, const uint16_t* source, size_t plane, size_t channel); -void MNNNC8HW8TONCHW_NO_TYPE(uint16_t* dest, const uint16_t* source, size_t plane, size_t channel); +void MNNNC8HW8TONC4HW4(float* dest, const FLOAT16* source, size_t plane, size_t channel); template void MNNPackUNIT(TOUT* dst, const TIN* src, size_t area, size_t depth) { - int depthCUnit = depth / UNIT; - int depthRemain = depthCUnit * UNIT; - int remain = depth - depthRemain; - int z, x, y; - const TIN* srcChannel[UNIT]; - const TIN* srcOffset = src; - for(z = 0; z < depthCUnit; ++z) { - for(y = 0; y < UNIT; ++y) { - srcChannel[y] = srcOffset + area * y; - } - for(x = 0; x < area; ++x) { - for(y = 0; y < UNIT; ++y) { - dst[0] = TOUT(srcChannel[y][0]); - srcChannel[y]++; - dst++; - } - } - srcOffset += area * UNIT; - } - if(remain > 0){ - for(y = 0; y < remain; ++y) { - srcChannel[y] = srcOffset + area * y; - } - for(x = 0; x < area; ++x) { - for(y = 0; y < remain; ++y) { - dst[0] = TOUT(srcChannel[y][0]); - srcChannel[y]++; - dst++; - } - for(y = remain; y < UNIT; ++y) { - dst[0] = 0; - dst++; - } + int z, x; + int cur = 0; + memset(dst, 0, area * UP_DIV(depth, UNIT) * UNIT * sizeof(TOUT)); + for (z = 0; z < depth; ++z) { + int plane = z / UNIT; + TOUT* dstPlane = plane * area * UNIT + dst; + int offset = z % UNIT; + for (x = 0; x < area; ++x) { + dstPlane[UNIT * x + offset] = TOUT(src[cur++]); } } } template void MNNUnpackUNIT(TOUT* dst, const TIN* src, size_t area, size_t 
depth) { - int depthCUnit = depth / UNIT; - int depthRemain = depthCUnit * UNIT; - int remain = depth - depthRemain; - int z, x, y; - const TIN* srcChannel[UNIT]; - const TIN* srcOffset = src; - for(z = 0; z < depthCUnit; ++z) { - for(y = 0; y < UNIT; ++y) { - srcChannel[y] = srcOffset + y; - for(x = 0; x < area; ++x) { - dst[0] = TOUT(srcChannel[y][0]); - srcChannel[y] += UNIT; - dst++; - } - } - srcOffset += area * UNIT; - } - if(remain > 0){ - for(y = 0; y < remain; ++y) { - srcChannel[y] = srcOffset + y; - for(x = 0; x < area; ++x) { - dst[0] = TOUT(srcChannel[y][0]); - srcChannel[y] += UNIT; - dst++; - } + int x; + int z; + int cur = 0; + for (z = 0; z < depth; ++z) { + int plane = z / UNIT; + const TIN* srcPlane = plane * area * UNIT + src; + int offset = z % UNIT; + for (x = 0; x < area; ++x) { + dst[cur++] = TOUT(srcPlane[UNIT * x + offset]); } } } -#endif +template +void MNNSlowCopy(T* dst, const U* src, size_t size) { + for (int i = 0; i < size; ++i) { + dst[i] = (T)src[i]; + } +} +#endif // Arm82OptFunc_hpp #endif diff --git a/source/backend/arm82/Arm82Pooling.cpp b/source/backend/arm82/Arm82Pooling.cpp index f7280aeb..c9a28f2e 100644 --- a/source/backend/arm82/Arm82Pooling.cpp +++ b/source/backend/arm82/Arm82Pooling.cpp @@ -5,8 +5,10 @@ // Created by MNN on 2020/01/08. // Copyright © 2018, Alibaba Group Holding Limited // -#ifdef __aarch64__ -#include "backend/arm82/Arm82Pooling.hpp" +#if defined(__ANDROID__) || defined(__aarch64__) + +#include "Arm82Pooling.hpp" +#include "Arm82Vec.hpp" #include "core/Concurrency.h" #include "core/Macro.h" @@ -14,6 +16,8 @@ #include #endif +using Vec = MNN::Math::Vec; + namespace MNN { static void poolingMaxFp16Unit(FLOAT16 *dst, int outputWidth, int outputHeight, const FLOAT16 *src, int inputWidth, @@ -30,34 +34,16 @@ static void poolingMaxFp16Unit(FLOAT16 *dst, int outputWidth, int outputHeight, auto dstCurPtr = dst + (oy * outputWidth + ox) * ARMV82_CHANNEL_UNIT; -#ifdef MNN_USE_NEON - float16x8_t curIn, curOut; - curOut = vdupq_n_f16(float16_t(-65504.0)); -#else - // init - FLOAT16 curOut[ARMV82_CHANNEL_UNIT]; - for (int i = 0; i < ARMV82_CHANNEL_UNIT; ++i) { - curOut[i] = -65504.0; - } -#endif + Vec curIn; + Vec curOut(-65504.0); for (int y = kys; y < kye; ++y) { for (int x = kxs; x < kxe; ++x) { const int inOffset = ((srcOriginY + y) * inputWidth + srcOriginX + x) * ARMV82_CHANNEL_UNIT; -#ifdef MNN_USE_NEON - curIn = vld1q_f16(src + inOffset); - curOut = vmaxq_f16(curIn, curOut); -#else - for (int i = 0; i < ARMV82_CHANNEL_UNIT; ++i) { - curOut[i] = std::max(curOut[i], src[inOffset + i]); - } -#endif + curIn = Vec::load(src + inOffset); + curOut = Vec::max(curIn, curOut); } } -#ifdef MNN_USE_NEON - vst1q_f16(dstCurPtr, curOut); -#else - memcpy(dstCurPtr, curOut, sizeof(FLOAT16) * ARMV82_CHANNEL_UNIT); -#endif + Vec::save(dstCurPtr, curOut); } } } @@ -77,39 +63,15 @@ static void poolingAvgFp16Unit(FLOAT16 *dst, int outputWidth, int outputHeight, auto dstCurPtr = dst + (oy * outputWidth + ox) * ARMV82_CHANNEL_UNIT; -#ifdef MNN_USE_NEON - float16x8_t curIn, curOut; - curOut = vdupq_n_f16(float16_t(0)); - float16x8_t size = vdupq_n_f16(float16_t(kernelCount)); -#else - // init - FLOAT16 curOut[ARMV82_CHANNEL_UNIT]; - for (int i = 0; i < ARMV82_CHANNEL_UNIT; ++i) { - curOut[i] = 0; - } -#endif + Vec curOut(0), size(kernelCount); for (int y = kys; y < kye; ++y) { for (int x = kxs; x < kxe; ++x) { const int inOffset = ((srcOriginY + y) * inputWidth + srcOriginX + x) * ARMV82_CHANNEL_UNIT; const auto srcUnit = src + inOffset; -#ifdef 
MNN_USE_NEON - curIn = vld1q_f16(srcUnit); - curOut = vaddq_f16(curIn, curOut); -#else - for (int i = 0; i < ARMV82_CHANNEL_UNIT; ++i) { - curOut[i] = curOut[i] + srcUnit[i]; - } -#endif + curOut = curOut + Vec::load(srcUnit); } } -#ifdef MNN_USE_NEON - vst1q_f16(dstCurPtr, vdivq_f16(curOut, size)); -#else - for (int i = 0; i < ARMV82_CHANNEL_UNIT; ++i) { - curOut[i] = curOut[i] / kernelCount; - } - memcpy(dstCurPtr, curOut, sizeof(FLOAT16) * ARMV82_CHANNEL_UNIT); -#endif + Vec::save(dstCurPtr, curOut / size); } } } @@ -192,11 +154,7 @@ ErrorCode Arm82Pooling::onExecute(const std::vector &inputs, const std MNN_CONCURRENCY_BEGIN(tId, mThreadNumber) mThreadFunction((int)tId, srcOrigin, dstOrigin); -#ifdef MNN_USE_THREAD_POOL MNN_CONCURRENCY_END(); -#else - MNN_CONCURRENCY_END(); -#endif } return NO_ERROR; @@ -212,4 +170,4 @@ class Arm82PoolingCreator : public Arm82Backend::Arm82Creator { REGISTER_ARM82_OP_CREATOR(OpType_Pooling, Arm82PoolingCreator); } // namespace MNN -#endif \ No newline at end of file +#endif diff --git a/source/backend/arm82/Arm82Pooling.hpp b/source/backend/arm82/Arm82Pooling.hpp index d864eb76..d970c54d 100644 --- a/source/backend/arm82/Arm82Pooling.hpp +++ b/source/backend/arm82/Arm82Pooling.hpp @@ -5,12 +5,13 @@ // Created by MNN on 2020/01/08. // Copyright © 2018, Alibaba Group Holding Limited // -#ifdef __aarch64__ +#if defined(__ANDROID__) || defined(__aarch64__) + #ifndef Arm82Pooling_hpp #define Arm82Pooling_hpp #include "MNN_generated.h" -#include "backend/arm82/Arm82Backend.hpp" +#include "Arm82Backend.hpp" #include "core/Execution.hpp" namespace MNN { diff --git a/source/backend/arm82/Arm82Raster.cpp b/source/backend/arm82/Arm82Raster.cpp index 1194f3f1..32179f50 100644 --- a/source/backend/arm82/Arm82Raster.cpp +++ b/source/backend/arm82/Arm82Raster.cpp @@ -5,7 +5,7 @@ // Created by MNN on 2020/5/25. // Copyright © 2018 Alibaba. All rights reserved. // -#ifdef __aarch64__ +#if defined(__ANDROID__) || defined(__aarch64__) #include "Arm82Raster.hpp" #include "math/Vec.hpp" diff --git a/source/backend/arm82/Arm82Raster.hpp b/source/backend/arm82/Arm82Raster.hpp index ed91d56f..aff64838 100644 --- a/source/backend/arm82/Arm82Raster.hpp +++ b/source/backend/arm82/Arm82Raster.hpp @@ -5,10 +5,10 @@ // Created by MNN on 2020/5/25. // Copyright © 2018 Alibaba. All rights reserved. // +#if defined(__ANDROID__) || defined(__aarch64__) #ifndef Arm82Raster_hpp #define Arm82Raster_hpp -#ifdef __aarch64__ #include "Arm82Backend.hpp" #include "core/Execution.hpp" #include @@ -35,5 +35,5 @@ private: bool mFast = false; }; } -#endif #endif /* Arm82Raster_hpp */ +#endif diff --git a/source/backend/arm82/Arm82Register.py b/source/backend/arm82/Arm82Register.py index 2cfdadfd..4ff5666b 100644 --- a/source/backend/arm82/Arm82Register.py +++ b/source/backend/arm82/Arm82Register.py @@ -31,7 +31,7 @@ def generateCPUFile(rootDir): f.write("extern void " + l + '();\n') f.write('\n') f.write('void registerArm82Ops() {\n') - f.write("#ifdef __aarch64__\n") + f.write("#if defined(__ANDROID__) || defined(__aarch64__)\n") for l in funcNames: f.write(l+'();\n') f.write("#endif\n") diff --git a/source/backend/arm82/Arm82Relu.cpp b/source/backend/arm82/Arm82Relu.cpp index 73fc17d8..8c63f11d 100644 --- a/source/backend/arm82/Arm82Relu.cpp +++ b/source/backend/arm82/Arm82Relu.cpp @@ -5,17 +5,18 @@ // Created by MNN on 2020/2/13. 
// Copyright © 2018, Alibaba Group Holding Limited // -#ifdef __aarch64__ +#if defined(__ANDROID__) || defined(__aarch64__) + #include -#include "backend/arm82/Arm82Relu.hpp" +#include "Arm82Relu.hpp" #include "MNN_generated.h" -#include "backend/arm82/Arm82Backend.hpp" -#include "backend/arm82/Arm82OptFunc.hpp" +#include "Arm82Backend.hpp" +#include "Arm82OptFunc.hpp" #include "core/Concurrency.h" #include "core/Macro.h" #include "half.hpp" - +#include #ifdef MNN_USE_NEON #include #endif @@ -32,7 +33,7 @@ static void _MNNArm82PReluWithChannel(FLOAT16 *dst, const FLOAT16 *src, const FL #ifdef MNN_USE_NEON float16x8_t value = vld1q_f16(src + i * ARMV82_CHANNEL_UNIT); float16x8_t mulSlope = vmulq_f16(value, slopeV); - float16x8_t lessThanZero = vcleq_f16(value, value_0); + uint16x8_t lessThanZero = vcleq_f16(value, value_0); vst1q_f16(dst + i * ARMV82_CHANNEL_UNIT, vbslq_f16(lessThanZero, mulSlope, value)); #else @@ -50,52 +51,51 @@ static void _MNNArm82PReluWithChannel(FLOAT16 *dst, const FLOAT16 *src, const FL } static void _MNNArm82LeakyReluWithChannel(FLOAT16 *dst, const FLOAT16 *src, const FLOAT16 slope, size_t length) { -#ifdef MNN_USE_NEON float16x8_t value_0 = vmovq_n_f16(0); float16x8_t slopeV = vmovq_n_f16(slope); -#endif + auto lC8 = length / ARMV82_CHANNEL_UNIT; + auto remain = length % ARMV82_CHANNEL_UNIT; - for (int i = 0; i < length; ++i) { -#ifdef MNN_USE_NEON - float16x8_t value = vld1q_f16(src + i * ARMV82_CHANNEL_UNIT); + for (int i = 0; i < lC8; ++i) { + float16x8_t value = vld1q_f16(src); float16x8_t mulSlope = vmulq_f16(value, slopeV); - float16x8_t lessThanZero = vcleq_f16(value, value_0); - - vst1q_f16(dst + i * ARMV82_CHANNEL_UNIT, vbslq_f16(lessThanZero, mulSlope, value)); -#else - for (int j = 0; j < ARMV82_CHANNEL_UNIT; ++j) { - int index = i * ARMV82_CHANNEL_UNIT + j; - if (src[index] < 0) { - dst[index] = src[index] * slope; - } else { - dst[index] = src[index]; - } - } -#endif + uint16x8_t lessThanZero = vcleq_f16(value, value_0); + vst1q_f16(dst, vbslq_f16(lessThanZero, mulSlope, value)); + src += ARMV82_CHANNEL_UNIT; + dst += ARMV82_CHANNEL_UNIT; + } + if (remain > 0) { + float16_t tempSrc[ARMV82_CHANNEL_UNIT]; + float16_t tempDst[ARMV82_CHANNEL_UNIT]; + ::memcpy(tempSrc, src, remain * sizeof(int16_t)); + float16x8_t value = vld1q_f16(tempSrc); + float16x8_t mulSlope = vmulq_f16(value, slopeV); + uint16x8_t lessThanZero = vcleq_f16(value, value_0); + vst1q_f16(tempDst, vbslq_f16(lessThanZero, mulSlope, value)); + ::memcpy(dst, tempDst, remain * sizeof(int16_t)); } } static void _MNNArm82ReluWithChannel(FLOAT16 *dst, const FLOAT16 *src, size_t length) { -#ifdef MNN_USE_NEON float16x8_t value_0 = vmovq_n_f16(0); -#endif + auto lC8 = length / ARMV82_CHANNEL_UNIT; + auto remain = length % ARMV82_CHANNEL_UNIT; + for (int i = 0; i < lC8; ++i) { + float16x8_t value = vld1q_f16(src); + uint16x8_t lessThanZero = vcleq_f16(value, value_0); - for (int i = 0; i < length; ++i) { -#ifdef MNN_USE_NEON - float16x8_t value = vld1q_f16(src + i * ARMV82_CHANNEL_UNIT); - float16x8_t lessThanZero = vcleq_f16(value, value_0); - - vst1q_f16(dst + i * ARMV82_CHANNEL_UNIT, vbslq_f16(lessThanZero, value_0, value)); -#else - for (int j = 0; j < ARMV82_CHANNEL_UNIT; ++j) { - int index = i * ARMV82_CHANNEL_UNIT + j; - if (src[index] < 0) { - dst[index] = 0; - } else { - dst[index] = src[index]; - } - } -#endif + vst1q_f16(dst, vbslq_f16(lessThanZero, value_0, value)); + dst += ARMV82_CHANNEL_UNIT; + src += ARMV82_CHANNEL_UNIT; + } + if (remain > 0) { + float16_t 
tempSrc[ARMV82_CHANNEL_UNIT]; + float16_t tempDst[ARMV82_CHANNEL_UNIT]; + ::memcpy(tempSrc, src, remain * sizeof(int16_t)); + float16x8_t value = vld1q_f16(tempSrc); + uint16x8_t lessThanZero = vcleq_f16(value, value_0); + vst1q_f16(tempDst, vbslq_f16(lessThanZero, value_0, value)); + ::memcpy(dst, tempDst, remain * sizeof(int16_t)); } } @@ -106,41 +106,37 @@ Arm82Relu::Arm82Relu(Backend *backend, float slope) : Execution(backend) { ErrorCode Arm82Relu::onExecute(const std::vector &inputs, const std::vector &outputs) { auto input = inputs[0]; auto output = outputs[0]; - const int dimension = input->dimensions(); - MNN_ASSERT(4 == dimension); - const int batch = input->batch(); - const int channel = input->channel(); - const int width = input->width(); - const int height = input->height(); - const int channelDivUnit = UP_DIV(channel, ARMV82_CHANNEL_UNIT); - const int batchAndChannel = batch * channelDivUnit; - const int plane = width * height; - + auto size = ARM82TensorElementSizeHelper(input); + auto schedule = static_cast(backend())->multiThreadDivide(size); + const auto src = input->host(); auto dst = output->host(); if (abs(mSlope) < std::numeric_limits::epsilon()) { // relu - mThreadNumbers = static_cast(backend())->numberThread(); - MNN_CONCURRENCY_BEGIN(tId, mThreadNumbers) - for (int b = (int)tId; b < batchAndChannel; b += mThreadNumbers) { - _MNNArm82ReluWithChannel(dst + b * plane * ARMV82_CHANNEL_UNIT, - src + b * plane * ARMV82_CHANNEL_UNIT, - plane); - } - MNN_CONCURRENCY_END(); + MNN_CONCURRENCY_BEGIN(tId, schedule.second) { + int start = schedule.first * (int)tId; + int realSize = schedule.first; + if (tId == schedule.second -1 ) { + realSize = size - start; + } + + _MNNArm82ReluWithChannel(dst + start, + src + start, realSize); + } MNN_CONCURRENCY_END(); } else { // leakyrelu FLOAT16 slopeHalf = half_float::half(mSlope); - mThreadNumbers = static_cast(backend())->numberThread(); - MNN_CONCURRENCY_BEGIN(tId, mThreadNumbers) - for (int b = (int)tId; b < batchAndChannel; b += mThreadNumbers) { - _MNNArm82LeakyReluWithChannel(dst + b * plane * ARMV82_CHANNEL_UNIT, - src + b * plane * ARMV82_CHANNEL_UNIT, - slopeHalf, - plane); - } - MNN_CONCURRENCY_END(); + MNN_CONCURRENCY_BEGIN(tId, schedule.second) { + int start = schedule.first * (int)tId; + int realSize = schedule.first; + if (tId == schedule.second -1 ) { + realSize = size - start; + } + + _MNNArm82LeakyReluWithChannel(dst + start, + src + start, slopeHalf, realSize); + } MNN_CONCURRENCY_END(); } return NO_ERROR; @@ -154,16 +150,14 @@ Arm82PRelu::Arm82PRelu(Backend *backend, const Op *op) : Execution(backend) { if (!allocRes) { return; } - auto slopePtr = mSlope->host(); - MNNQuantizeFP16(slopePtr, param->slope()->data(), slopeLength); + auto slopePtr = mSlope->host(); + MNNQuantizeFP16(param->slope()->data(), slopePtr, slopeLength); } ErrorCode Arm82PRelu::onExecute(const std::vector &inputs, const std::vector &outputs) { const auto input = inputs[0]; auto output = outputs[0]; - const int dimension = input->dimensions(); - MNN_ASSERT(4 == dimension); const int batch = input->batch(); const int channel = input->channel(); const int width = input->width(); diff --git a/source/backend/arm82/Arm82Relu.hpp b/source/backend/arm82/Arm82Relu.hpp index edd863a7..0a005943 100644 --- a/source/backend/arm82/Arm82Relu.hpp +++ b/source/backend/arm82/Arm82Relu.hpp @@ -5,7 +5,8 @@ // Created by MNN on 2020/2/13. 
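Editor's note: the rewritten ReLU/LeakyReLU kernels above always take the NEON path and handle lengths that are not a multiple of 8 by staging the tail through a small stack buffer. A condensed sketch of that pattern (assumes FP16 vector arithmetic is available, which the build flags in this patch guarantee):

```cpp
#include <arm_neon.h>
#include <cstring>

// ReLU over `length` halves: 8 lanes per step in the main loop, and the
// remainder is copied through a temporary so the same vector code is reused.
static void reluFP16Sketch(float16_t* dst, const float16_t* src, size_t length) {
    const float16x8_t zero = vmovq_n_f16(0);
    size_t blocks = length / 8, remain = length % 8;
    for (size_t i = 0; i < blocks; ++i, src += 8, dst += 8) {
        float16x8_t v   = vld1q_f16(src);
        uint16x8_t  neg = vcleq_f16(v, zero);        // lanes where v <= 0
        vst1q_f16(dst, vbslq_f16(neg, zero, v));     // select 0 on negative lanes
    }
    if (remain > 0) {
        float16_t tmpIn[8] = {0}, tmpOut[8];
        ::memcpy(tmpIn, src, remain * sizeof(float16_t));
        float16x8_t v   = vld1q_f16(tmpIn);
        uint16x8_t  neg = vcleq_f16(v, zero);
        vst1q_f16(tmpOut, vbslq_f16(neg, zero, v));
        ::memcpy(dst, tmpOut, remain * sizeof(float16_t));
    }
}
```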
// Copyright © 2018, Alibaba Group Holding Limited // -#ifdef __aarch64__ +#if defined(__ANDROID__) || defined(__aarch64__) + #ifndef Arm82Relu_hpp #define Arm82Relu_hpp @@ -21,7 +22,6 @@ public: private: float mSlope = 0.0; - int mThreadNumbers; }; class Arm82PRelu : public Execution { diff --git a/source/backend/arm82/Arm82Unary.cpp b/source/backend/arm82/Arm82Unary.cpp new file mode 100644 index 00000000..779ee420 --- /dev/null +++ b/source/backend/arm82/Arm82Unary.cpp @@ -0,0 +1,237 @@ +// +// Arm82Unary.cpp +// MNN +// +// Created by MNN on 2018/08/02. +// Copyright © 2018, Alibaba Group Holding Limited +// +#if defined(__ANDROID__) || defined(__aarch64__) + +#include +#include +#include +#include "Arm82Unary.hpp" +#include "Arm82Backend.hpp" +#include "core/Macro.h" +#include "core/OpCommonUtils.hpp" +#include "core/Concurrency.h" +#include "MNN_generated.h" + + +#ifdef MNN_USE_NEON +#include +#endif + +namespace MNN { +Arm82Unary::Arm82Unary(Backend *b, UnaryOpOperation type) : MNN::Execution(b), mType(type) { + // nothing to do +} + +ErrorCode Arm82Unary::onResize(const std::vector &inputs, const std::vector &outputs) { + MNN_ASSERT(1 == outputs.size()); + auto dtype = inputs[0]->getType(); + MNN_ASSERT(dtype == halide_type_of() || dtype == halide_type_of()); + return NO_ERROR; +} + +template +static ErrorCode _unaryOp(void* inputPtr, void* outputPtr, int elementSize, Backend* bn) { + Func f; + auto backend = [bn]() { + return bn; + }; + const T *inputData = (T*)inputPtr; + T *outputData = (T *)outputPtr; + auto numberThread = ((CPUBackend*)bn)->threadNumber(); + MNN_CONCURRENCY_BEGIN(tId, numberThread) { + for (int i=tId; i= 3) { + return x; + } else { + return x * (x + 3) / 6; + } + } +#ifdef MNN_USE_NEON + static float16x8_t vecFunc(const float16x8_t& x) { + float16x8_t value_l = vmovq_n_f16(-3); + float16x8_t value_h = vmovq_n_f16(3); + float16x8_t value_d = vmovq_n_f16(1.f/6); + float16x8_t value_z = vmovq_n_f16(0); + uint16x8_t right = vcleq_f16(x, value_l); + float16x8_t middle = vmulq_f16(vmulq_f16(x, vaddq_f16(x, value_h)), value_d); + float16x8_t tmp = vbslq_f16(right, x, middle); + uint16x8_t left = vcgtq_f16(x, value_l); + return vbslq_f16(left, tmp, value_z); + } +#endif +}; + +template +ErrorCode Arm82Unary::onExecuteInternal(Tensor* input, Tensor* output) { + const int threadNum = ((Arm82Backend*)backend())->threadNumber(); + const int count = ARM82TensorElementSizeHelper(output); + const FLOAT16* inputData = input->host(); + FLOAT16* outputData = output->host(); + + MNN_CONCURRENCY_BEGIN(tId, threadNum) { + int realSize = UP_DIV(UP_DIV(count, ARMV82_CHANNEL_UNIT), threadNum) * ARMV82_CHANNEL_UNIT; + int startIndex = tId * realSize, endIndex = ALIMIN(startIndex + realSize, count); + if (endIndex > startIndex) { + int index = startIndex, readSizeUnit = realSize / ARMV82_CHANNEL_UNIT; +#ifdef MNN_USE_NEON + for (int i = 0; i < readSizeUnit; ++i, index += ARMV82_CHANNEL_UNIT) { + float16x8_t in = vld1q_f16(inputData + index); + vst1q_f16(outputData + index, Helper::vecFunc(in)); + } +#endif + for (; index < endIndex; ++index) { + outputData[index] = Helper::scalarFunc(inputData[index]); + } + } + } MNN_CONCURRENCY_END(); + + return NO_ERROR; +} + +ErrorCode Arm82Unary::onExecute(const std::vector &inputs, const std::vector &outputs) { + auto input = inputs[0]; + auto output = outputs[0]; + ErrorCode code; + + switch (mType) { + case UnaryOpOperation_ABS: + code = onExecuteInternal(input, output); + break; + case UnaryOpOperation_SQUARE: + code = onExecuteInternal(input, 
output); + break; + case UnaryOpOperation_RSQRT: + code = onExecuteInternal(input, output); + break; + case UnaryOpOperation_NEG: + code = onExecuteInternal(input, output); + break; +#if defined(__aarch64__) + case UnaryOpOperation_SQRT: + code = onExecuteInternal(input, output); + break; +#endif + case UnaryOpOperation_RECIPROCAL: + code = onExecuteInternal(input, output); + break; + case UnaryOpOperation_HARDSWISH: + code = onExecuteInternal(input, output); + break; + default: + MNN_ASSERT(false); + break; + } + + return code; +} + +class Arm82UnaryCreator : public Arm82Backend::Arm82Creator { +public: + virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, + const MNN::Op *op, Backend *backend) const override { + auto type = op->main_as_UnaryOp()->opType(); + std::vector supportOps = { + UnaryOpOperation_ABS, UnaryOpOperation_SQUARE, UnaryOpOperation_RSQRT, + UnaryOpOperation_NEG, UnaryOpOperation_SQRT, UnaryOpOperation_RECIPROCAL + }; + if (std::find(supportOps.begin(), supportOps.end(), type) != supportOps.end()) { + return new Arm82Unary(backend, type); + } + return nullptr; + } +}; + +REGISTER_ARM82_OP_CREATOR(OpType_UnaryOp, Arm82UnaryCreator); + +} // namespace MNN + +#endif diff --git a/source/backend/arm82/Arm82Unary.hpp b/source/backend/arm82/Arm82Unary.hpp new file mode 100644 index 00000000..22645ad4 --- /dev/null +++ b/source/backend/arm82/Arm82Unary.hpp @@ -0,0 +1,30 @@ +// +// Arm82Unary.hpp +// MNN +// +// Created by MNN on 2018/08/02. +// Copyright © 2018, Alibaba Group Holding Limited +// +#if defined(__ANDROID__) || defined(__aarch64__) + +#ifndef Arm82Unary_hpp +#define Arm82Unary_hpp + +#include "core/Execution.hpp" +#include "MNN_generated.h" + +namespace MNN { +class Arm82Unary : public Execution { +public: + Arm82Unary(Backend *b, UnaryOpOperation type); + virtual ~Arm82Unary() = default; + virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; + virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; + template ErrorCode onExecuteInternal(Tensor*, Tensor*); + +protected: + UnaryOpOperation mType; +}; +} // namespace MNN +#endif /* Arm82Unary_hpp */ +#endif diff --git a/source/backend/arm82/Arm82Vec.hpp b/source/backend/arm82/Arm82Vec.hpp new file mode 100644 index 00000000..9b5262d5 --- /dev/null +++ b/source/backend/arm82/Arm82Vec.hpp @@ -0,0 +1,117 @@ +// +// Arm82Vec.hpp +// MNN +// +// Created by MNN on 2019/01/31. 
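Editor's note: `Arm82Unary` keeps each operator as a small helper exposing a scalar entry point plus, under `MNN_USE_NEON`, a vector one, so `onExecuteInternal` can run the vector path for full 8-lane groups and fall back to the scalar function for the tail. A hypothetical ReLU6 helper written in that same shape, purely to illustrate the convention (not part of the patch):

```cpp
#include <arm_neon.h>
#include "Arm82Backend.hpp"   // FLOAT16 typedef

// Hypothetical helper following the Arm82Unary convention.
struct _Relu6 {
    static FLOAT16 scalarFunc(const FLOAT16& x) {
        if (x < FLOAT16(0)) return FLOAT16(0);
        if (x > FLOAT16(6)) return FLOAT16(6);
        return x;
    }
#ifdef MNN_USE_NEON
    static float16x8_t vecFunc(const float16x8_t& x) {
        return vminq_f16(vmaxq_f16(x, vmovq_n_f16(0)), vmovq_n_f16(6));
    }
#endif
};
// Wiring it up would look like: code = onExecuteInternal<_Relu6>(input, output);
```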
+// Copyright © 2018, Alibaba Group Holding Limited +// +#if defined(__ANDROID__) || defined(__aarch64__) + +#ifndef Arm82Vec_hpp +#define Arm82Vec_hpp + +#include "Arm82Backend.hpp" +#include "math/Vec.hpp" + +#ifdef MNN_USE_NEON +namespace MNN { +namespace Math { +template<> +struct Vec { + using VecType = Vec; + float16x8_t value; + Vec() { + } + Vec(const float v) { + value = vdupq_n_f16(v); + } + Vec(const float16x8_t v) { + value = v; + } + Vec(const VecType& lr) { + value = lr.value; + } + Vec(const VecType&& lr) { + value = std::move(lr.value); + } + float operator[](size_t i) { + return value[i]; + } + static VecType load(const FLOAT16* addr) { + VecType v = { vld1q_f16(addr) }; + return v; + } + static void save(FLOAT16* addr, const VecType& v) { + vst1q_f16(addr, v.value); + } + static VecType max(const VecType& v1, const VecType& v2) { + VecType dst = { vmaxq_f16(v1.value, v2.value) }; + return dst; + } + static VecType min(const VecType& v1, const VecType& v2) { + VecType dst = { vminq_f16(v1.value, v2.value) }; + return dst; + } + static void mla(VecType& v1, const VecType& v2, const VecType& v3) { + v1.value = vfmaq_f16(v1.value, v2.value, v3.value); + } + static void mls(VecType& v1, const VecType& v2, const VecType& v3) { + v1.value = vfmsq_f16(v1.value, v2.value, v3.value); + } + VecType operator+(const VecType& lr) { + VecType dst = { vaddq_f16(value, lr.value) }; + return dst; + } + VecType operator-(const VecType& lr) { + VecType dst = { vsubq_f16(value, lr.value) }; + return dst; + } + VecType operator*(float lr) { + VecType dst = { vmulq_n_f16(value, lr) }; + return dst; + } + VecType operator*(const VecType& lr) { + VecType dst = { vmulq_f16(value, lr.value) }; + return dst; + } + VecType operator/(float lr) { +#if defined(__aarch64__) + VecType dst = { vdivq_f16(value, vdupq_n_f16(lr)) }; +#else + VecType dst; + for (int i = 0; i < 8; ++i) { + dst.value[i] = value[i] / lr; + } +#endif + return dst; + } + VecType operator/(const VecType& lr) { +#if defined(__aarch64__) + VecType dst = { vdivq_f16(value, lr.value) }; +#else + VecType dst; + for (int i = 0; i < 8; ++i) { + dst.value[i] = value[i] / lr.value[i]; + } +#endif + return dst; + } + VecType& operator=(const VecType& lr) { + value = lr.value; + return *this; + } + VecType& operator=(const VecType&& lr) { + value = std::move(lr.value); + return *this; + } + VecType operator-() { + VecType dst = { vnegq_f16(value) }; + return dst; + } +}; +} // namespace Math +} // namespace MNN +#endif /* MNN_USE_NEON */ + +#endif // Arm82Vec_hpp +#endif diff --git a/source/backend/arm82/Arm82WinogradOptFunc.cpp b/source/backend/arm82/Arm82WinogradOptFunc.cpp new file mode 100644 index 00000000..ac2d9f92 --- /dev/null +++ b/source/backend/arm82/Arm82WinogradOptFunc.cpp @@ -0,0 +1,209 @@ +// +// Arm82WinogradOptFunc.cpp +// MNN +// +// Created by MNN on 2018/10/08. 
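Editor's note: the `Vec<FLOAT16, 8>` specialization above wraps a single `float16x8_t` register so FP16 kernels can be written with plain arithmetic operators and shared with the generic `Vec` code paths. A small, illustrative usage sketch of a scaled accumulate over a row:

```cpp
#include "Arm82Vec.hpp"   // MNN::Math::Vec<FLOAT16, 8> under MNN_USE_NEON
using Vec = MNN::Math::Vec<FLOAT16, 8>;

// dst[i] += a[i] * b, 8 halves per step; `count` is assumed to be a multiple of 8.
static void scaleAccumulateSketch(FLOAT16* dst, const FLOAT16* a, float b, size_t count) {
    Vec scale(b);                                   // broadcast via vdupq_n_f16
    for (size_t i = 0; i < count; i += 8) {
        Vec acc = Vec::load(dst + i);
        Vec::mla(acc, Vec::load(a + i), scale);     // acc += a * scale (vfmaq_f16)
        Vec::save(dst + i, acc);
    }
}
```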
+// Copyright © 2018, Alibaba Group Holding Limited +// +#if defined(__ANDROID__) || defined(__aarch64__) + +#include "Arm82WinogradOptFunc.hpp" +#include "Arm82Vec.hpp" +#include "Arm82OptFunc.hpp" +#include +#include +#include "core/Macro.h" +#include "math/Vec.hpp" +using Vec = MNN::Math::Vec; + +namespace MNN { + +static void _sourceTransformUnit4x4(const FLOAT16* srcBlock, FLOAT16* dstStart, size_t srcStep, size_t dstStep) { + Vec s0 = Vec::load(srcBlock + 0 * srcStep); + Vec s1 = Vec::load(srcBlock + 1 * srcStep); + Vec s2 = Vec::load(srcBlock + 2 * srcStep); + Vec s3 = Vec::load(srcBlock + 3 * srcStep); + + auto m0 = s0 - s2; + auto m1 = s1 + s2; + auto m2 = s2 - s1; + auto m3 = s3 - s1; + + Vec::save(dstStart + 0 * dstStep, m0); + Vec::save(dstStart + 1 * dstStep, m1); + Vec::save(dstStart + 2 * dstStep, m2); + Vec::save(dstStart + 3 * dstStep, m3); +} +static void _destTransformUnit4x2(const FLOAT16* srcBlock, FLOAT16* dstStart, size_t srcStep, size_t dstStep) { + Vec s0 = Vec::load(srcBlock + 0 * srcStep); + Vec s1 = Vec::load(srcBlock + 1 * srcStep); + Vec s2 = Vec::load(srcBlock + 2 * srcStep); + Vec s3 = Vec::load(srcBlock + 3 * srcStep); + + auto m0 = s0 + s1 + s2; + auto m1 = (s1 - s2) + s3; + + Vec::save(dstStart + 0 * dstStep, m0); + Vec::save(dstStart + 1 * dstStep, m1); +} +static void _destTransformUnit4x3(const FLOAT16* srcBlock, FLOAT16* dstStart, size_t srcStep, size_t dstStep) { + Vec s0 = Vec::load(srcBlock + 0 * srcStep); + Vec s1 = Vec::load(srcBlock + 1 * srcStep); + Vec s2 = Vec::load(srcBlock + 2 * srcStep); + Vec s3 = Vec::load(srcBlock + 3 * srcStep); + + auto m0 = s0 + s1 + s2; + auto m1 = (s1 - s2); + auto m2 = (s1 + s2) + s3; + + Vec::save(dstStart + 0 * dstStep, m0); + Vec::save(dstStart + 1 * dstStep, m1); + Vec::save(dstStart + 2 * dstStep, m2); +} + +#define LOAD6 \ +Vec s0 = Vec::load(srcBlock + 0 * srcStep); \ +Vec s1 = Vec::load(srcBlock + 1 * srcStep); \ +Vec s2 = Vec::load(srcBlock + 2 * srcStep); \ +Vec s3 = Vec::load(srcBlock + 3 * srcStep); \ +Vec s4 = Vec::load(srcBlock + 4 * srcStep); \ +Vec s5 = Vec::load(srcBlock + 5 * srcStep); + +static void _sourceTransformUnit6x6(const FLOAT16* srcBlock, FLOAT16* dstStart, size_t srcStep, size_t dstStep) { + LOAD6; + Vec m0 = s0 * (FLOAT16)4 - s2 * (FLOAT16)5 + s4; + + Vec m1 = (s1 + s2) * (-(FLOAT16)4) + (s3 + s4); + Vec m2 = (s1 - s2) * ((FLOAT16)4) + (s4 - s3); + + Vec m3 = s1 * -(FLOAT16)2 - s2 + s3 * (FLOAT16)2 + s4; + Vec m4 = s1 * (FLOAT16)2 - s2 - s3 * (FLOAT16)2 + s4; + + Vec m5 = s1 * (FLOAT16)4 - s3 * (FLOAT16)5 + s5; + + Vec::save(dstStart + 0 * dstStep, m0); + Vec::save(dstStart + 1 * dstStep, m1); + Vec::save(dstStart + 2 * dstStep, m2); + Vec::save(dstStart + 3 * dstStep, m3); + Vec::save(dstStart + 4 * dstStep, m4); + Vec::save(dstStart + 5 * dstStep, m5); +} + +static void _destTransformUnit6x5(const FLOAT16* srcBlock, FLOAT16* dstStart, size_t srcStep, size_t dstStep) { + Vec s0 = Vec::load(srcBlock + 0 * srcStep); + Vec s1 = Vec::load(srcBlock + 1 * srcStep); + Vec s2 = Vec::load(srcBlock + 2 * srcStep); + Vec s3 = Vec::load(srcBlock + 3 * srcStep); + Vec s4 = Vec::load(srcBlock + 4 * srcStep); + Vec s5 = Vec::load(srcBlock + 5 * srcStep); + + auto m0 = s0 + s1 + s2 + s3 + s4; + auto m1 = (s1 - s2) + (s3 - s4) * (FLOAT16)2; + auto m2 = (s1 + s2) + (s3 + s4) * (FLOAT16)4; + auto m3 = (s1 - s2) + (s3 - s4) * (FLOAT16)8; + auto m4 = (s1 + s2) + (s3 + s4) * (FLOAT16)16 + s5; + + Vec::save(dstStart + 0 * dstStep, m0); + Vec::save(dstStart + 1 * dstStep, m1); + Vec::save(dstStart + 2 * 
dstStep, m2); + Vec::save(dstStart + 3 * dstStep, m3); + Vec::save(dstStart + 4 * dstStep, m4); +} +static void _destTransformUnit6x4(const FLOAT16* srcBlock, FLOAT16* dstStart, size_t srcStep, size_t dstStep) { + Vec s0 = Vec::load(srcBlock + 0 * srcStep); + Vec s1 = Vec::load(srcBlock + 1 * srcStep); + Vec s2 = Vec::load(srcBlock + 2 * srcStep); + Vec s3 = Vec::load(srcBlock + 3 * srcStep); + Vec s4 = Vec::load(srcBlock + 4 * srcStep); + Vec s5 = Vec::load(srcBlock + 5 * srcStep); + auto v0 = s3 + s4; + auto v1 = s3 - s4; + auto v2 = s1 + s2; + auto v3 = s1 - s2; + + auto m0 = s0 + v2 + v0; + auto m1 = v3 + v1 + v1; + auto m2 = v2 + v0 * (FLOAT16)4; + auto m3 = v3 + v1 * (FLOAT16)8 + s5; + + Vec::save(dstStart + 0 * dstStep, m0); + Vec::save(dstStart + 1 * dstStep, m1); + Vec::save(dstStart + 2 * dstStep, m2); + Vec::save(dstStart + 3 * dstStep, m3); +} +static void _destTransformUnit6x3(const FLOAT16* srcBlock, FLOAT16* dstStart, size_t srcStep, size_t dstStep) { + Vec s0 = Vec::load(srcBlock + 0 * srcStep); + Vec s1 = Vec::load(srcBlock + 1 * srcStep); + Vec s2 = Vec::load(srcBlock + 2 * srcStep); + Vec s3 = Vec::load(srcBlock + 3 * srcStep); + Vec s4 = Vec::load(srcBlock + 4 * srcStep); + Vec s5 = Vec::load(srcBlock + 5 * srcStep); + + auto m0 = s0 + s1 + s2 + s3 + s4; + auto m1 = (s1 - s2) + (s3 - s4) * (FLOAT16)2; + auto m2 = (s1 + s2) + (s3 + s4) * (FLOAT16)4 + s5; + + Vec::save(dstStart + 0 * dstStep, m0); + Vec::save(dstStart + 1 * dstStep, m1); + Vec::save(dstStart + 2 * dstStep, m2); +} +static void _destTransformUnit6x2(const FLOAT16* srcBlock, FLOAT16* dstStart, size_t srcStep, size_t dstStep) { + Vec s0 = Vec::load(srcBlock + 0 * srcStep); + Vec s1 = Vec::load(srcBlock + 1 * srcStep); + Vec s2 = Vec::load(srcBlock + 2 * srcStep); + Vec s3 = Vec::load(srcBlock + 3 * srcStep); + Vec s4 = Vec::load(srcBlock + 4 * srcStep); + Vec s5 = Vec::load(srcBlock + 5 * srcStep); + + auto m0 = s0 + s1 + s2 + s3 + s4; + auto m1 = (s1 - s2) + (s3 - s4) * (FLOAT16)2 + s5; + + Vec::save(dstStart + 0 * dstStep, m0); + Vec::save(dstStart + 1 * dstStep, m1); +} + +static Arm82WinogradFunction::TransformFunc gProcUnit6[] = { + nullptr, // 0 + nullptr, // 1 + _destTransformUnit6x2, + _destTransformUnit6x3, + _destTransformUnit6x4, + _destTransformUnit6x5, +}; + + +Arm82WinogradFunction::TransformFunc Arm82WinogradFunction::chooseSourceTransform(int k, int w) { + if (6 == k && 6 == w) { + return _sourceTransformUnit6x6; + } + if (4 == k && 4 == w) { + return _sourceTransformUnit4x4; + } + MNN_ASSERT(false); + return nullptr; +} + +Arm82WinogradFunction::TransformFunc Arm82WinogradFunction::chooseDestTransform(int k, int h) { + if (6 == k) { + if (h <= 1 || h > 5) { + return nullptr; + } + return gProcUnit6[h]; + } + if (2 == h && 4 == k) { + return _destTransformUnit4x2; + } + if (3 == h && 4 == k) { + return _destTransformUnit4x3; + } + return nullptr; +} + +int Arm82MNNGetConvTileNumber() { + int eP, lP, hP; + Arm82MNNGetMatMulPackMode(&eP, &lP, &hP); + return eP; // 8 +} + +} // namespace MNN +#endif diff --git a/source/backend/arm82/Arm82WinogradOptFunc.hpp b/source/backend/arm82/Arm82WinogradOptFunc.hpp new file mode 100644 index 00000000..ab4df0e0 --- /dev/null +++ b/source/backend/arm82/Arm82WinogradOptFunc.hpp @@ -0,0 +1,30 @@ +// +// Arm82WinogradOptFunc.hpp +// MNN +// +// Created by MNN on 2018/10/08. 
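Editor's note: the unit-4 transforms above are the Winograd F(2x2, 3x3) matrices (interpolation points 0, 1, -1), up to a sign flip in the last row that cancels between the source and destination transforms. Writing them out makes the per-vector arithmetic easy to check against the code:

```latex
B^{T} =
\begin{pmatrix}
1 & 0 & -1 & 0\\
0 & 1 &  1 & 0\\
0 & -1 & 1 & 0\\
0 & -1 & 0 & 1
\end{pmatrix},
\qquad
A^{T} =
\begin{pmatrix}
1 & 1 &  1 & 0\\
0 & 1 & -1 & 1
\end{pmatrix}
```

So `_sourceTransformUnit4x4` computes m = B^T s (m0 = s0 - s2, m1 = s1 + s2, m2 = s2 - s1, m3 = s3 - s1), and `_destTransformUnit4x2` applies A^T to the accumulated tiles (m0 = s0 + s1 + s2, m1 = s1 - s2 + s3), one 8-lane FP16 vector per element.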
+// Copyright © 2018, Alibaba Group Holding Limited +// +#if defined(__ANDROID__) || defined(__aarch64__) + +#ifndef Arm82WinogradOptFunc_hpp +#define Arm82WinogradOptFunc_hpp + +#include "Arm82Backend.hpp" + +namespace MNN { +class Arm82WinogradFunction { +public: + typedef void (*TransformFunc)(const FLOAT16* srcBlock, FLOAT16* dstStart, size_t srcStep, size_t dstStep); + + /*Use the generator with interp 0.5*/ + static TransformFunc chooseSourceTransform(int k, int w); + static TransformFunc chooseDestTransform(int k, int h); +}; + +int Arm82MNNGetConvTileNumber(); + +} // namespace MNN + +#endif /* Arm82WinogradOptFunc_hpp */ +#endif diff --git a/source/backend/arm82/CMakeLists.txt b/source/backend/arm82/CMakeLists.txt index 3b918088..466ea304 100644 --- a/source/backend/arm82/CMakeLists.txt +++ b/source/backend/arm82/CMakeLists.txt @@ -1,22 +1,18 @@ -file(GLOB MNN_ARM82_SRCS "${CMAKE_CURRENT_LIST_DIR}/*.cpp") +file(GLOB MNN_ARM82_SRCS "${CMAKE_CURRENT_LIST_DIR}/*.cpp" "${CMAKE_CURRENT_LIST_DIR}/compute/*.cpp") -set(COMPILE_ARM64 OFF) -if(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR IOS_ARCH STREQUAL "arm64") - set(COMPILE_ARM64 ON) -endif() - -file(GLOB MNN_ARM82_SRCS_ASM "${CMAKE_CURRENT_LIST_DIR}/asm/arm64/*") - -add_library( - MNN_Arm82 - OBJECT - ${MNN_ARM82_SRCS} - ${MNN_ARM82_SRCS_ASM} - ) - -if(COMPILE_ARM64) +if(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv7" OR ARCHS MATCHES "^armv7(;armv7s)?") + file(GLOB MNN_ARM82_SRCS_ASM "${CMAKE_CURRENT_LIST_DIR}/asm/arm32/*") + add_library(MNN_Arm82 OBJECT ${MNN_ARM82_SRCS} ${MNN_ARM82_SRCS_ASM}) + target_compile_options(MNN_Arm82 PRIVATE -march=armv8.2-a+fp16 -mfpu=neon-fp-armv8 -mfloat-abi=softfp) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64") + file(GLOB MNN_ARM82_SRCS_ASM "${CMAKE_CURRENT_LIST_DIR}/asm/arm64/*") + add_library(MNN_Arm82 OBJECT ${MNN_ARM82_SRCS} ${MNN_ARM82_SRCS_ASM}) target_compile_options(MNN_Arm82 PRIVATE -march=armv8.2-a+fp16) +else() +# Building fat binary requires multiple seperate builds and lipo-by-hand under CMake's design endif() +target_include_directories(MNN_Arm82 PRIVATE ${CMAKE_CURRENT_LIST_DIR}) +target_include_directories(MNN_Arm82 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/compute/) target_include_directories(MNN_Arm82 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/asm/) diff --git a/source/backend/arm82/asm/arm32/Arm82MNNPackForMatMul_A.S b/source/backend/arm82/asm/arm32/Arm82MNNPackForMatMul_A.S new file mode 100644 index 00000000..f2f666cd --- /dev/null +++ b/source/backend/arm82/asm/arm32/Arm82MNNPackForMatMul_A.S @@ -0,0 +1,525 @@ +// +// MNNPackC4ForMatMul_A.S +// MNN +// +// Created by MNN on 2020/06/10. 
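Editor's note: with the CMake change above, the armv7 objects are also built with `-march=armv8.2-a+fp16` (softfp), so the resulting code must only be dispatched on CPUs that actually implement FP16 arithmetic; the runtime gate lives elsewhere in MNN and is not part of this patch. As a rough, hedged illustration of such a gate on aarch64 Linux/Android only (not MNN's actual detection code):

```cpp
#include <sys/auxv.h>
#include <asm/hwcap.h>

// True when the CPU advertises half-precision SIMD arithmetic (ARMv8.2 FP16).
// aarch64 Linux/Android only; other targets need their own detection.
static bool cpuSupportsFP16ArithSketch() {
#if defined(__aarch64__) && defined(HWCAP_ASIMDHP)
    return (getauxval(AT_HWCAP) & HWCAP_ASIMDHP) != 0;
#else
    return false;
#endif
}
```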
+// Copyright © 2018, Alibaba Group Holding Limited +// +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +asm_function Arm82MNNPackForMatMul_A +//void Arm82MNNPackForMatMul_A(FLOAT16* destOrigin, FLOAT16 const** sourceGroup, const int32_t* info, const int32_t* el) +//Auto: r0: dest, r1:sourceGroup, r2: info, r3:el +push {r4-r11, lr} +vpush {q4-q7} +ldr r10, [r2, #0] // number +ldr r4, [r2, #4] // eReal +ldr r11, [r2, #8] // eDest +ldr r6, [r2, #12] // xOffset +// xOffset -> xOffset * 8 * sizeof(FLOAT16) +// eReal -> eReal * 8 * sizeof(FLOAT16) +// eDest -> eDest * sizeof(FLOAT16) +mov r12, #2 // sizeof(FLOAT16) +mov r9, #16 // 8 * sizeof(FLOAT16) +mul r4, r9, r4 +mul r11, r12, r11 +mul r6, r9, r6 + +LoopNumber: +ldr r5, [r3, #4] // l +ldr r8, [r3, #8] // eOffset +ldr r7, [r3, #12] // lOffset + +push {r0, r1} +ldr r1, [r1, #0] + +// Compute dest ptr: r0 = r0 + eOffset * sizeof(float) + lOffset * eDest * sizeof(float) +mul r7, r11, r7 +lsl r8, r8, #1 // r8 = r8 * sizeof(FLOAT16) +add r0, r0, r7 +add r0, r0, r8 + +ldr r2, [r3, #0] // e + +E12: +cmp r2, #12 +blt E8 + cmp r5, #8 + blt E12_LoopLExtra + E12_LoopL8: + mov r12, r1 +// {q0-q11} => [[d0, d8, d16], +// [d2, d10, d18], +// [d4, d12, d20], +// [d6, d14, d22], +// [d1, d9, d17], +// [d3, d11, d19], +// [d5, d13, d21], +// [d7, d15, d23]] +.macro TRANSPOSE_8X12 + vld1.16 {q0}, [r1], r6 + vld1.16 {q1}, [r1], r6 + vld1.16 {q2}, [r1], r6 + vld1.16 {q3}, [r1], r6 + vld1.16 {q4}, [r1], r6 + vld1.16 {q5}, [r1], r6 + vld1.16 {q6}, [r1], r6 + vld1.16 {q7}, [r1], r6 + vld1.16 {q8}, [r1], r6 + vld1.16 {q9}, [r1], r6 + vld1.16 {q10}, [r1], r6 + vld1.16 {q11}, [r1], r6 + vtrn.16 d0, d2 + vtrn.16 d4, d6 + vtrn.16 d1, d3 + vtrn.16 d5, d7 + vtrn.16 d8, d10 + vtrn.16 d12, d14 + vtrn.16 d9, d11 + vtrn.16 d13, d15 + vtrn.16 d16, d18 + vtrn.16 d20, d22 + vtrn.16 d17, d19 + vtrn.16 d21, d23 + vtrn.32 d0, d4 + vtrn.32 d2, d6 + vtrn.32 d1, d5 + vtrn.32 d3, d7 + vtrn.32 d8, d12 + vtrn.32 d10, d14 + vtrn.32 d9, d13 + vtrn.32 d11, d15 + vtrn.32 d16, d20 + vtrn.32 d18, d22 + vtrn.32 d17, d21 + vtrn.32 d19, d23 +.endm +.macro STORE_LINE_12 addr, v0, v1, v2 + vst1.16 {\v0}, [\addr]! + vst1.16 {\v1}, [\addr]! + vst1.16 {\v2}, [\addr]! 
+.endm + TRANSPOSE_8X12 + STORE_LINE_12 r0, d0, d8, d16 + STORE_LINE_12 r0, d2, d10, d18 + STORE_LINE_12 r0, d4, d12, d20 + STORE_LINE_12 r0, d6, d14, d22 + STORE_LINE_12 r0, d1, d9, d17 + STORE_LINE_12 r0, d3, d11, d19 + STORE_LINE_12 r0, d5, d13, d21 + STORE_LINE_12 r0, d7, d15, d23 + + add r1, r12, r4 + sub r5, r5, #8 + cmp r5, #8 + bge E12_LoopL8 + + cmp r5, #0 + beq E12_LoopLEnd + E12_LoopLExtra: + TRANSPOSE_8X12 + E12_LoopL7: + cmp r5, #7 // if r5 < 7 + blt E12_LoopL6 // jump to E12_LoopL6 + STORE_LINE_12 r0, d0, d8, d16 + STORE_LINE_12 r0, d2, d10, d18 + STORE_LINE_12 r0, d4, d12, d20 + STORE_LINE_12 r0, d6, d14, d22 + STORE_LINE_12 r0, d1, d9, d17 + STORE_LINE_12 r0, d3, d11, d19 + STORE_LINE_12 r0, d5, d13, d21 + b E12_LoopLEnd + E12_LoopL6: + cmp r5, #6 // if r5 < 6 + blt E12_LoopL5 // jump to E12_LoopL5 + STORE_LINE_12 r0, d0, d8, d16 + STORE_LINE_12 r0, d2, d10, d18 + STORE_LINE_12 r0, d4, d12, d20 + STORE_LINE_12 r0, d6, d14, d22 + STORE_LINE_12 r0, d1, d9, d17 + STORE_LINE_12 r0, d3, d11, d19 + b E12_LoopLEnd + E12_LoopL5: + cmp r5, #5 // if r5 < 5 + blt E12_LoopL4 // jump to E12_LoopL4 + STORE_LINE_12 r0, d0, d8, d16 + STORE_LINE_12 r0, d2, d10, d18 + STORE_LINE_12 r0, d4, d12, d20 + STORE_LINE_12 r0, d6, d14, d22 + STORE_LINE_12 r0, d1, d9, d17 + b E12_LoopLEnd + E12_LoopL4: + cmp r5, #4 // if r5 < 4 + blt E12_LoopL3 // jump to E12_LoopL3 + STORE_LINE_12 r0, d0, d8, d16 + STORE_LINE_12 r0, d2, d10, d18 + STORE_LINE_12 r0, d4, d12, d20 + STORE_LINE_12 r0, d6, d14, d22 + b E12_LoopLEnd + E12_LoopL3: + cmp r5, #3 // if r5 < 3 + blt E12_LoopL2 // jump to E12_LoopL2 + STORE_LINE_12 r0, d0, d8, d16 + STORE_LINE_12 r0, d2, d10, d18 + STORE_LINE_12 r0, d4, d12, d20 + b E12_LoopLEnd + E12_LoopL2: + cmp r5, #2 // if r5 < 2 + blt E12_LoopL1 // jump to E12_LoopL1 + STORE_LINE_12 r0, d0, d8, d16 + STORE_LINE_12 r0, d2, d10, d18 + b E12_LoopLEnd + E12_LoopL1: + STORE_LINE_12 r0, d0, d8, d16 + E12_LoopLEnd: + b End + +E8: +cmp r2, #8 +blt E4 + sub r11, r11, #8 + mov r9, r5 + mov r7, r1 + mov r8, r0 + cmp r5, #8 + blt E8_LoopLExtra + E8_LoopL8: + mov r12, r1 +// {q0-q7} => [[d0, d8], +// [d2, d10], +// [d4, d12], +// [d6, d14], +// [d1, d9], +// [d3, d11], +// [d5, d13], +// [d7, d15]] +.macro TRANSPOSE_8X8 + vld1.16 {q0}, [r1], r6 + vld1.16 {q1}, [r1], r6 + vld1.16 {q2}, [r1], r6 + vld1.16 {q3}, [r1], r6 + vld1.16 {q4}, [r1], r6 + vld1.16 {q5}, [r1], r6 + vld1.16 {q6}, [r1], r6 + vld1.16 {q7}, [r1], r6 + vtrn.16 d0, d2 + vtrn.16 d4, d6 + vtrn.16 d1, d3 + vtrn.16 d5, d7 + vtrn.16 d8, d10 + vtrn.16 d12, d14 + vtrn.16 d9, d11 + vtrn.16 d13, d15 + vtrn.32 d0, d4 + vtrn.32 d2, d6 + vtrn.32 d1, d5 + vtrn.32 d3, d7 + vtrn.32 d8, d12 + vtrn.32 d10, d14 + vtrn.32 d9, d13 + vtrn.32 d11, d15 +.endm +.macro STORE_LINE_8 addr, offset, v0, v1 + vst1.16 {\v0}, [\addr]! 
+ vst1.16 {\v1}, [\addr], \offset +.endm + TRANSPOSE_8X8 + STORE_LINE_8 r0, r11, d0, d8 + STORE_LINE_8 r0, r11, d2, d10 + STORE_LINE_8 r0, r11, d4, d12 + STORE_LINE_8 r0, r11, d6, d14 + STORE_LINE_8 r0, r11, d1, d9 + STORE_LINE_8 r0, r11, d3, d11 + STORE_LINE_8 r0, r11, d5, d13 + STORE_LINE_8 r0, r11, d7, d15 + add r1, r12, r4 + sub r5, r5, #8 + cmp r5, #8 + bge E8_LoopL8 + + cmp r5, #0 + beq E8_LoopLEnd + E8_LoopLExtra: + TRANSPOSE_8X8 + E8_LoopL7: + cmp r5, #7 // if r5 < 7 + blt E8_LoopL6 // jump to E8_LoopL6 + STORE_LINE_8 r0, r11, d0, d8 + STORE_LINE_8 r0, r11, d2, d10 + STORE_LINE_8 r0, r11, d4, d12 + STORE_LINE_8 r0, r11, d6, d14 + STORE_LINE_8 r0, r11, d1, d9 + STORE_LINE_8 r0, r11, d3, d11 + STORE_LINE_8 r0, r11, d5, d13 + b E8_LoopLEnd + E8_LoopL6: + cmp r5, #6 // if r5 < 6 + blt E8_LoopL5 // jump to E8_LoopL5 + STORE_LINE_8 r0, r11, d0, d8 + STORE_LINE_8 r0, r11, d2, d10 + STORE_LINE_8 r0, r11, d4, d12 + STORE_LINE_8 r0, r11, d6, d14 + STORE_LINE_8 r0, r11, d1, d9 + STORE_LINE_8 r0, r11, d3, d11 + b E8_LoopLEnd + E8_LoopL5: + cmp r5, #5 // if r5 < 5 + blt E8_LoopL4 // jump to E8_LoopL4 + STORE_LINE_8 r0, r11, d0, d8 + STORE_LINE_8 r0, r11, d2, d10 + STORE_LINE_8 r0, r11, d4, d12 + STORE_LINE_8 r0, r11, d6, d14 + STORE_LINE_8 r0, r11, d1, d9 + b E8_LoopLEnd + E8_LoopL4: + cmp r5, #4 // if r5 < 4 + blt E8_LoopL3 // jump to E8_LoopL3 + STORE_LINE_8 r0, r11, d0, d8 + STORE_LINE_8 r0, r11, d2, d10 + STORE_LINE_8 r0, r11, d4, d12 + STORE_LINE_8 r0, r11, d6, d14 + b E8_LoopLEnd + E8_LoopL3: + cmp r5, #3 // if r5 < 3 + blt E8_LoopL2 // jump to E8_LoopL2 + STORE_LINE_8 r0, r11, d0, d8 + STORE_LINE_8 r0, r11, d2, d10 + STORE_LINE_8 r0, r11, d4, d12 + b E8_LoopLEnd + E8_LoopL2: + cmp r5, #2 // if r5 < 2 + blt E8_LoopL1 // jump to E8_LoopL1 + STORE_LINE_8 r0, r11, d0, d8 + STORE_LINE_8 r0, r11, d2, d10 + b E8_LoopLEnd + E8_LoopL1: + STORE_LINE_8 r0, r11, d0, d8 + E8_LoopLEnd: + add r11, r11, #8 + lsl r1, r6, #3 + add r1, r7, r1 + sub r2, r2, #8 + add r0, r8, #16 // 8 * sizeof(FLOAT16) + mov r5, r9 + +E4: +cmp r2, #4 +blt E1 + mov r9, r5 + mov r7, r1 + mov r8, r0 + cmp r5, #8 + blt E4_LoopLExtra + E4_LoopL8: + mov r12, r1 +// {q0-q3} => [[d0], +// [d2], +// [d4], +// [d6], +// [d1], +// [d3], +// [d5], +// [d7]] +.macro TRANSPOSE_8X4 + vld1.16 {q0}, [r1], r6 + vld1.16 {q1}, [r1], r6 + vld1.16 {q2}, [r1], r6 + vld1.16 {q3}, [r1], r6 + vtrn.16 d0, d2 + vtrn.16 d4, d6 + vtrn.16 d1, d3 + vtrn.16 d5, d7 + vtrn.32 d0, d4 + vtrn.32 d2, d6 + vtrn.32 d1, d5 + vtrn.32 d3, d7 +.endm + TRANSPOSE_8X4 + vst1.16 {d0}, [r0], r11 + vst1.16 {d2}, [r0], r11 + vst1.16 {d4}, [r0], r11 + vst1.16 {d6}, [r0], r11 + vst1.16 {d1}, [r0], r11 + vst1.16 {d3}, [r0], r11 + vst1.16 {d5}, [r0], r11 + vst1.16 {d7}, [r0], r11 + + add r1, r12, r4 + sub r5, r5, #8 + cmp r5, #8 + bge E4_LoopL8 + + cmp r5, #0 + beq E4_LoopLEnd + E4_LoopLExtra: + TRANSPOSE_8X4 + E4_LoopL7: + cmp r5, #7 // if r5 < 7 + blt E4_LoopL6 // jump to E4_LoopL6 + vst1.16 {d0}, [r0], r11 + vst1.16 {d2}, [r0], r11 + vst1.16 {d4}, [r0], r11 + vst1.16 {d6}, [r0], r11 + vst1.16 {d1}, [r0], r11 + vst1.16 {d3}, [r0], r11 + vst1.16 {d5}, [r0], r11 + b E4_LoopLEnd + E4_LoopL6: + cmp r5, #6 // if r5 < 6 + blt E4_LoopL5 // jump to E4_LoopL5 + vst1.16 {d0}, [r0], r11 + vst1.16 {d2}, [r0], r11 + vst1.16 {d4}, [r0], r11 + vst1.16 {d6}, [r0], r11 + vst1.16 {d1}, [r0], r11 + vst1.16 {d3}, [r0], r11 + b E4_LoopLEnd + E4_LoopL5: + cmp r5, #5 // if r5 < 5 + blt E4_LoopL4 // jump to E4_LoopL4 + vst1.16 {d0}, [r0], r11 + vst1.16 {d2}, [r0], r11 + vst1.16 {d4}, [r0], r11 + 
vst1.16 {d6}, [r0], r11 + vst1.16 {d1}, [r0], r11 + b E4_LoopLEnd + E4_LoopL4: + cmp r5, #4 // if r5 < 4 + blt E4_LoopL3 // jump to E4_LoopL3 + vst1.16 {d0}, [r0], r11 + vst1.16 {d2}, [r0], r11 + vst1.16 {d4}, [r0], r11 + vst1.16 {d6}, [r0], r11 + b E4_LoopLEnd + E4_LoopL3: + cmp r5, #3 // if r5 < 3 + blt E4_LoopL2 // jump to E4_LoopL2 + vst1.16 {d0}, [r0], r11 + vst1.16 {d2}, [r0], r11 + vst1.16 {d4}, [r0], r11 + b E4_LoopLEnd + E4_LoopL2: + cmp r5, #2 // if r5 < 2 + blt E4_LoopL1 // jump to E4_LoopL1 + vst1.16 {d0}, [r0], r11 + vst1.16 {d2}, [r0], r11 + b E4_LoopLEnd + E4_LoopL1: + vst1.16 {d0}, [r0], r11 + E4_LoopLEnd: + lsl r1, r6, #2 + add r1, r7, r1 + sub r2, r2, #4 + add r0, r8, #8 // 4 * sizeof(FLOAT16) + mov r5, r9 + +E1: +cmp r2, #0 +beq End +LoopE1: + mov r9, r5 + mov r7, r1 + mov r8, r0 + cmp r5, #8 + blt E1_LoopL7 + E1_LoopL8: + vld1.16 {q0}, [r1], r4 + vst1.16 {d0[0]}, [r0], r11 + vst1.16 {d0[1]}, [r0], r11 + vst1.16 {d0[2]}, [r0], r11 + vst1.16 {d0[3]}, [r0], r11 + vst1.16 {d1[0]}, [r0], r11 + vst1.16 {d1[1]}, [r0], r11 + vst1.16 {d1[2]}, [r0], r11 + vst1.16 {d1[3]}, [r0], r11 + sub r5, r5, #8 + cmp r5, #8 + bge E1_LoopL8 + + E1_LoopL7: + cmp r5, #7 // if r5 < 7 + blt E1_LoopL6 // jump to E1_LoopL6 + vld1.16 {q0}, [r1], r4 + vst1.16 {d0[0]}, [r0], r11 + vst1.16 {d0[1]}, [r0], r11 + vst1.16 {d0[2]}, [r0], r11 + vst1.16 {d0[3]}, [r0], r11 + vst1.16 {d1[0]}, [r0], r11 + vst1.16 {d1[1]}, [r0], r11 + vst1.16 {d1[2]}, [r0], r11 + b E1_LoopLEnd + E1_LoopL6: + cmp r5, #6 // if r5 < 6 + blt E1_LoopL5 // jump to E1_LoopL5 + vld1.16 {q0}, [r1], r4 + vst1.16 {d0[0]}, [r0], r11 + vst1.16 {d0[1]}, [r0], r11 + vst1.16 {d0[2]}, [r0], r11 + vst1.16 {d0[3]}, [r0], r11 + vst1.16 {d1[0]}, [r0], r11 + vst1.16 {d1[1]}, [r0], r11 + b E1_LoopLEnd + E1_LoopL5: + cmp r5, #5 // if r5 < 5 + blt E1_LoopL4 // jump to E1_LoopL4 + vld1.16 {q0}, [r1], r4 + vst1.16 {d0[0]}, [r0], r11 + vst1.16 {d0[1]}, [r0], r11 + vst1.16 {d0[2]}, [r0], r11 + vst1.16 {d0[3]}, [r0], r11 + vst1.16 {d1[0]}, [r0], r11 + b E1_LoopLEnd + E1_LoopL4: + cmp r5, #4 // if r5 < 4 + blt E1_LoopL3 // jump to E1_LoopL3 + vld1.16 {d0}, [r1], r4 + vst1.16 {d0[0]}, [r0], r11 + vst1.16 {d0[1]}, [r0], r11 + vst1.16 {d0[2]}, [r0], r11 + vst1.16 {d0[3]}, [r0], r11 + b E1_LoopLEnd + E1_LoopL3: + cmp r5, #3 // if r5 < 3 + blt E1_LoopL2 // jump to E1_LoopL2 + vld1.16 {d0}, [r1], r4 + vst1.16 {d0[0]}, [r0], r11 + vst1.16 {d0[1]}, [r0], r11 + vst1.16 {d0[2]}, [r0], r11 + b E1_LoopLEnd + E1_LoopL2: + cmp r5, #2 // if r5 < 2 + blt E1_LoopL1 // jump to E1_LoopL1 + vld1.16 {d0}, [r1], r4 + vst1.16 {d0[0]}, [r0], r11 + vst1.16 {d0[1]}, [r0], r11 + b E1_LoopLEnd + E1_LoopL1: + cmp r5, #1 // if r5 < 1 + blt E1_LoopLEnd + vld1.16 {d0}, [r1], r4 + vst1.16 {d0[0]}, [r0], r11 + E1_LoopLEnd: + subs r2, r2, #1 + add r0, r8, #2 + add r1, r7, r6 + mov r5, r9 + bne LoopE1 + +End: + +pop {r0, r1} +subs r10, r10, #1 +add r3, r3, #16 +add r1, r1, #4 + +bne LoopNumber +vpop {q4-q7} +pop {r4-r11, pc} + +#endif +#endif diff --git a/source/backend/arm82/asm/arm32/MNNConvDwF23MulTransUnitFP16.S b/source/backend/arm82/asm/arm32/MNNConvDwF23MulTransUnitFP16.S new file mode 100644 index 00000000..44ab3fd3 --- /dev/null +++ b/source/backend/arm82/asm/arm32/MNNConvDwF23MulTransUnitFP16.S @@ -0,0 +1,110 @@ +// +// MNNConvDwF23MulTransUnitFP16.S +// MNN +// +// Created by MNN on 2019/4/4. 
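Editor's note: the packing routine above rearranges the FP16 A matrix from the backend's channel-packed layout into the eP = 12 panels that `MNNPackedMatMulFP16` consumes; the arm64 version later in this patch documents the transform as `(l/8, e, 8) -> (e/12, l, 12)`. A simplified scalar reference for the contiguous case, useful for checking the transposes (a sketch; the real routine also applies the xOffset/eReal/eOffset/lOffset strides from the info/el arrays):

```cpp
// Scalar sketch of Arm82MNNPackForMatMul_A for one contiguous source block.
// FLOAT16 is MNN's half type (see Arm82Backend.hpp).
static void packAReference(FLOAT16* dest, const FLOAT16* source, int e, int l, int eP = 12) {
    for (int x = 0; x < e; ++x) {
        for (int y = 0; y < l; ++y) {
            int srcIndex = (y / 8) * e * 8 + x * 8 + (y % 8);      // (l/8, e, 8)
            int dstIndex = (x / eP) * l * eP + y * eP + (x % eP);  // (e/eP, l, eP)
            dest[dstIndex] = source[srcIndex];
        }
    }
}
```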
+// Copyright © 2018, Alibaba Group Holding Limited +// +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNConvDwF23MulTransUnitFP16 +//void MNNConvDwF23MulTransUnitFP16(FLOAT16 **cacheLine, const FLOAT16 *weigth, FLOAT16 *dest, size_t ow); +//Auto: r0:cacheLine, r1:weight, r2:dest, r3:ow +push {r4-r7, lr} +vpush {q4-q7} +ldr r4, [r0, #0] +ldr r5, [r0, #4] +ldr r6, [r0, #8] + +vld1.16 {q4, q5}, [r1]! +vld1.16 {q6, q7}, [r1]! +vld1.16 {q8, q9}, [r1]! + +L2: +cmp r3, #2 +blt L1 + +LoopL2: +mov r7, r1 + +vld1.16 {q12, q13}, [r4]! +vmul.f16 q0, q4, q12 +vld1.16 {q14, q15}, [r4]! +vmul.f16 q1, q5, q13 +vld1.16 {q10, q11}, [r7]! +vmul.f16 q2, q6, q14 +vld1.16 {q12, q13}, [r5]! +vmul.f16 q3, q7, q15 + +vmla.f16 q0, q8, q12 +vld1.16 {q14, q15}, [r5]! +vmla.f16 q1, q9, q13 +vmla.f16 q2, q10, q14 +vmla.f16 q3, q11, q15 + +vld1.16 {q10, q11}, [r7]! +vld1.16 {q12, q13}, [r6]! +vmla.f16 q0, q10, q12 +vmla.f16 q1, q11, q13 +vld1.16 {q10, q11}, [r7]! +vadd.f16 q0, q1, q0 +vld1.16 {q14, q15}, [r6]! + +vmla.f16 q2, q10, q14 +vmla.f16 q3, q11, q15 +vadd.f16 q0, q0, q2 + +vadd.f16 q3, q3, q1 +vsub.f16 q1, q3, q2 + +vst1.16 {q0, q1}, [r2]! + +sub r3, r3, #2 +cmp r3, #2 +bge LoopL2 + + +L1: +cmp r3, #0 +beq End +mov r7, r1 +mov r12, #32 +vld1.16 {q12, q13}, [r4]! +vmul.f16 q0, q4, q12 +vld1.16 {q14}, [r4]! +vmul.f16 q1, q5, q13 +vld1.16 {q10}, [r7], r12 +vmul.f16 q2, q6, q14 +vld1.16 {q12, q13}, [r5]! + +vmla.f16 q0, q8, q12 +vld1.16 {q14}, [r5]! +vmla.f16 q1, q9, q13 +vmla.f16 q2, q10, q14 + +vld1.16 {q10, q11}, [r7]! +vld1.16 {q12, q13}, [r6]! +vmla.f16 q0, q10, q12 +vmla.f16 q1, q11, q13 +vld1.16 {q10}, [r7] +vld1.16 {q14}, [r6]! + +vmla.f16 q2, q10, q14 + +vadd.f16 q0, q1, q0 +vadd.f16 q0, q0, q2 + +vst1.16 {q0}, [r2]! +End: + +vpop {q4-q7} +pop {r4-r7, pc} + +#endif +#endif diff --git a/source/backend/arm82/asm/arm32/MNNConvDwF23SourceTransUnitFP16.S b/source/backend/arm82/asm/arm32/MNNConvDwF23SourceTransUnitFP16.S new file mode 100644 index 00000000..f2fb6771 --- /dev/null +++ b/source/backend/arm82/asm/arm32/MNNConvDwF23SourceTransUnitFP16.S @@ -0,0 +1,60 @@ +// +// MNNConvDwF23SourceTransUnitFP16.S +// MNN +// +// Created by MNN on 2019/4/4. +// Copyright © 2018, Alibaba Group Holding Limited +// +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNConvDwF23SourceTransUnitFP16 +// void MNNConvDwF23SourceTransUnitFP16(const FLOAT16 *source, FLOAT16 *dest, size_t unit); + +//Auto: +//r0: source, r1:dest, r2:unit + +push {lr} + +L1: +cmp r2, #0 +beq End + +vld1.16 {q8, q9}, [r0]! +vld1.16 {q10, q11}, [r0]! +subs r2, r2, #1 +vsub.f16 q0, q8, q10 +vadd.f16 q1, q9, q10 +beq L1LoopEnd + +L1Loop: + vsub.f16 q2, q10, q9 + vst1.16 {q0, q1}, [r1]! + vsub.f16 q3, q11, q9 + vmov.i32 q8, q10 + vst1.16 {q2, q3}, [r1]! + vmov.i32 q9, q11 + vld1.16 {q10, q11}, [r0]! + vsub.f16 q0, q8, q10 + vadd.f16 q1, q9, q10 + + subs r2, r2, #1 + bne L1Loop +L1LoopEnd: +vsub.f16 q2, q10, q9 +vsub.f16 q3, q11, q9 + +vst1.16 {q0, q1}, [r1]! +vst1.16 {q2, q3}, [r1]! + + +End: + +pop {pc} +#endif +#endif diff --git a/source/backend/arm82/asm/arm32/MNNConvRunForLineDepthwiseFP16.S b/source/backend/arm82/asm/arm32/MNNConvRunForLineDepthwiseFP16.S new file mode 100644 index 00000000..240c9b17 --- /dev/null +++ b/source/backend/arm82/asm/arm32/MNNConvRunForLineDepthwiseFP16.S @@ -0,0 +1,208 @@ +// +// MNNConvRunForLineDepthwiseFP16.S +// MNN +// +// Created by MNN on 2019/02/04. 
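Editor's note: `MNNConvDwF23SourceTransUnitFP16` above applies the same F(2x2, 3x3) source transform as the Winograd kernels to 8-lane FP16 groups, with consecutive units overlapping by two positions. A scalar-structured sketch of a single unit (the assembly additionally pipelines the overlap):

```cpp
#include <arm_neon.h>

// One depthwise F(2x2,3x3) source-transform unit: four 8-lane FP16 groups in,
// four groups out (d = B^T * s along this axis).
static void dwF23SourceUnitSketch(const float16_t* src, float16_t* dst) {
    float16x8_t s0 = vld1q_f16(src +  0);
    float16x8_t s1 = vld1q_f16(src +  8);
    float16x8_t s2 = vld1q_f16(src + 16);
    float16x8_t s3 = vld1q_f16(src + 24);
    vst1q_f16(dst +  0, vsubq_f16(s0, s2));   // s0 - s2
    vst1q_f16(dst +  8, vaddq_f16(s1, s2));   // s1 + s2
    vst1q_f16(dst + 16, vsubq_f16(s2, s1));   // s2 - s1
    vst1q_f16(dst + 24, vsubq_f16(s3, s1));   // s3 - s1
}
```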
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNConvRunForLineDepthwiseFP16 +//void MNNConvRunForLineDepthwiseFP16(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight, size_t width, size_t src_w_setup, +// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep) + + +//Auto Load: +//r0:dst, r1:src, r2:weight, r3:width + +push {r4-r11, lr} + +//Load From Sp +//r4:src_w_setup, r5:fw, r6:fh, r7:dilate_x_step, r8:dilate_y_step, r9: height, r10:srcHStep, r11:dstHStep +ldr r4, [sp, #36] +ldr r5, [sp, #40] +ldr r6, [sp, #44] +ldr r7, [sp, #48] +ldr r8, [sp, #52] +ldr r9, [sp, #56] +ldr r10, [sp, #60] +ldr r11, [sp, #64] + +vpush {q4-q7} + +mov r12, #2 // sizeof(FLOAT16) +mul r4, r12, r4 +mul r7, r12, r7 +mul r8, r12, r8 +mul r10, r12, r10 +mul r11, r12, r11 + +//dilate_y_step -> dilate_y_step - fw*dilate_x_step +mul r12, r5, r7 +sub r8, r8, r12 + +LoopDY: +push {r0, r1, r3, r9, r10, r11} + +L8: +cmp r3, #7 +ble L4 + +mov r12, #8 +mul r12, r4, r12 + +L8Loop: + vmov.i32 q8, #0 + vmov.i32 q9, #0 + vmov.i32 q10, #0 + vmov.i32 q11, #0 + vmov.i32 q12, #0 + vmov.i32 q13, #0 + vmov.i32 q14, #0 + vmov.i32 q15, #0 + + vmov.i32 d14[0], r1 + vmov.i32 d14[1], r2 + mov r9, r6 + L8LoopH: + mov r10, r5 + L8LoopW: + vld1.16 {q3}, [r2]! + vld1.16 {q0}, [r1], r4 + subs r10, r10, #1 + vmla.f16 q8, q3, q0 + vld1.16 {q1}, [r1], r4 + vmla.f16 q9, q3, q1 + vld1.16 {q0}, [r1], r4 + vmla.f16 q10, q0, q3 + vld1.16 {q1}, [r1], r4 + vmla.f16 q11, q1, q3 + vld1.16 {q0}, [r1], r4 + vmla.f16 q12, q0, q3 + vld1.16 {q1}, [r1], r4 + vmla.f16 q13, q1, q3 + vld1.16 {q0}, [r1], r4 + vmla.f16 q14, q0, q3 + vld1.16 {q1}, [r1], r4 + vmla.f16 q15, q1, q3 + + sub r1, r1, r12 + add r1, r1, r7 + + bne L8LoopW + L8LoopWEnd: + subs r9, r9, #1 + add r1, r1, r8 + bne L8LoopH + + sub r3, r3, #8 + vst1.16 {q8, q9}, [r0]! + vmov.i32 r1, d14[0] + vmov.i32 r2, d14[1] + vst1.16 {q10, q11}, [r0]! + add r1, r1, r12 + vst1.16 {q12, q13}, [r0]! + cmp r3, #8 + vst1.16 {q14, q15}, [r0]! + bge L8Loop + +L4: +cmp r3, #3 +ble L1 + +mov r12, #4 +mul r12, r4, r12 + +L4Loop: + vmov.i32 q8, #0 + vmov.i32 q9, #0 + vmov.i32 q10, #0 + vmov.i32 q11, #0 + + vmov.i32 d8[0], r1 + vmov.i32 d9[0], r2 + mov r9, r6 + L4LoopH: + mov r10, r5 + L4LoopW: + vld1.16 {q12}, [r2]! + vld1.16 {q0}, [r1], r4 + subs r10, r10, #1 + vmla.f16 q8, q12, q0 + vld1.16 {q1}, [r1], r4 + vmla.f16 q9, q12, q1 + vld1.16 {q2}, [r1], r4 + vmla.f16 q10, q2, q12 + vld1.16 {q3}, [r1], r4 + sub r1, r1, r12 + vmla.f16 q11, q3, q12 + + add r1, r1, r7 + + bne L4LoopW + subs r9, r9, #1 + add r1, r1, r8 + bne L4LoopH + + sub r3, r3, #4 + vst1.16 {q8, q9}, [r0]! + vmov.i32 r1, d8[0] + vmov.i32 r2, d9[0] + vst1.16 {q10, q11}, [r0]! + add r1, r1, r12 + cmp r3, #4 + bge L4Loop + + + + +L1: +cmp r3, #0 +beq End + +L1Loop: + vmov.i32 q0, #0 + mov r9, r6 + mov r11, r1 + mov r12, r2 + L1LoopH: + mov r10, r5 + L1LoopW: + vld1.16 {q1}, [r1], r7 + vld1.16 {q2}, [r2]! + vmla.f16 q0, q1, q2 + subs r10, r10, #1 + bne L1LoopW + subs r9, r9, #1 + add r1, r1, r8 + bne L1LoopH + + subs r3, r3, #1 + vst1.16 {q0}, [r0]! 
+ mov r2, r12 + add r1, r11, r4 + bne L1Loop + + +End: + +pop {r0, r1, r3, r9, r10, r11} +add r0, r0, r11 +subs r9, r9, #1 +add r1, r1, r10 +bne LoopDY + + +vpop {q4-q7} +pop {r4-r11, pc} + + +#endif +#endif diff --git a/source/backend/arm82/asm/arm32/MNNExpFP16.S b/source/backend/arm82/asm/arm32/MNNExpFP16.S new file mode 100644 index 00000000..1256f9ff --- /dev/null +++ b/source/backend/arm82/asm/arm32/MNNExpFP16.S @@ -0,0 +1,87 @@ +// +// MNNExpFP16.S +// MNN +// +// Created by MNN on 2019/01/18. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +//void MNNExpFP16(FLOAT16* dest, const FLOAT16* source, const FLOAT16* parameters, size_t block) +asm_function MNNExpFP16 + +//r0: dest, r1:source, r2:parameters, r3:countC8 +push {r4, lr} +vpush {q5, q6} + +vld1.32 {q0, q1}, [r2] + +vmov.i32 q2, #87 +vcvt.f32.s32 q2, q2 +vneg.f32 q3, q2 + +Loop: + +vld1.32 {q8, q9}, [r1]! + +vmin.f32 q8, q8, q2 +vmin.f32 q9, q9, q2 +vmax.f32 q8, q8, q3 +vmax.f32 q9, q9, q3 + +vneg.f32 q10, q8 +vneg.f32 q11, q9 + +vmul.f32 q8, q10, d0[1] +vmul.f32 q9, q11, d0[1] +vcvt.s32.f32 q8, q8 +vcvt.s32.f32 q9, q9 + +vcvt.f32.s32 q12, q8 +vcvt.f32.s32 q13, q9 + +//q10, q11: t +vmls.f32 q10, q12, d0[0] +vmls.f32 q11, q13, d0[0] + +.macro MLA_TWO z0 z1 z2 z3 +vdup.32 \z1, \z0 +vmla.f32 \z1, \z2, \z3 +.endm + +MLA_TWO d3[0], q12, q10, d3[1] +MLA_TWO d3[0], q13, q11, d3[1] +MLA_TWO d2[1], q14, q10, q12 +MLA_TWO d2[1], q15, q11, q13 +MLA_TWO d2[0], q12, q10, q14 +MLA_TWO d2[0], q13, q11, q15 +MLA_TWO d1[1], q14, q10, q12 +MLA_TWO d1[1], q15, q11, q13 +MLA_TWO d1[0], q12, q10, q14 +MLA_TWO d1[0], q13, q11, q15 + +//q12, q13 is expRemain + +vshl.i32 q8, q8, #23 +vshl.i32 q9, q9, #23 +vadd.i32 q12, q12, q8 +vadd.i32 q13, q13, q9 + +vst1.32 {q12, q13}, [r0]! + + +subs r3, r3, #1 +bne Loop + +vpop {q5, q6} +pop {r4, pc} + + +#endif +#endif diff --git a/source/backend/arm82/asm/arm32/MNNPackedMatMulFP16.S b/source/backend/arm82/asm/arm32/MNNPackedMatMulFP16.S new file mode 100644 index 00000000..b3a77feb --- /dev/null +++ b/source/backend/arm82/asm/arm32/MNNPackedMatMulFP16.S @@ -0,0 +1,155 @@ +// +// MNNPackedMatMulFP16.S +// MNN +// +// Created by MNN on 2020/06/10. 
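Editor's note: `MNNExpFP16` above uses the usual range-reduction scheme: clamp the input to [-87, 87], split it as n*ln2 + r, evaluate a degree-5 polynomial for exp(r), and rebuild 2^n by adding n << 23 to the float bit pattern. The coefficients come from the `parameters` argument, and the assembly negates the clamped input before reducing (so with the usual parameter values it appears to produce exp(-x)). A scalar sketch of the idea, not a bit-exact match (the kernel truncates rather than rounds and works on vectors):

```cpp
#include <cstdint>
#include <cstring>
#include <cmath>

// Range-reduced exp: x = n*ln2 + r, exp(r) via a degree-5 polynomial,
// then scale by 2^n through the float exponent field (the n << 23 add).
static float expApproxSketch(float x) {
    x = std::fmax(-87.f, std::fmin(87.f, x));       // clamp, as the kernel does
    int   n = (int)std::round(x * 1.442695041f);     // n = round(x / ln2)
    float r = x - n * 0.6931471806f;                 // remainder near zero
    float p = 1.f + r * (1.f + r * (0.5f + r * (1.f/6 + r * (1.f/24 + r * (1.f/120)))));
    int32_t bits;
    std::memcpy(&bits, &p, sizeof(bits));
    bits += n << 23;                                 // multiply by 2^n
    std::memcpy(&p, &bits, sizeof(bits));
    return p;
}
```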
+// Copyright © 2018, Alibaba Group Holding Limited +// +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +// 12 * 4 MatMul +asm_function MNNPackedMatMulFP16 +//void MNNPackedMatMulFP16(FLOAT16* C, const FLOAT16* A, const FLOAT16* B, const size_t* parameter, const FLOAT16* postParameters, const FLOAT16* bias); +// Auto: r0: C, r1:A, r2:B, r3:parameter +// Load from sp: r5: postParameters, r6:bias + +push {r4-r11, lr} +ldr r5, [sp, #36] +ldr r6, [sp, #40] + +ldr r4, [r3, #8] // h +ldr r7, [r3, #4] // l +add r4, r4, #7 +ldr r8, [r3, #12]//cStride +ldr r3, [r3, #20]//bExtraStride +lsr r4, r4, #3 + +sub r8, r8, #192 + +vpush {q4-q7} +// q0, q1: src +// q3: weight +// q4 - q15: dst + +cmp r5, #0 +beq LoopH +vld1.32 {q0}, [r5] +vcvt.f16.f32 d0, q0 + +.macro COMPUTE op, s0, s1, d0, d1, d2, d3 + \op \d0, \s0, \s1[0] + \op \d1, \s0, \s1[1] + \op \d2, \s0, \s1[2] + \op \d3, \s0, \s1[3] +.endm + +.macro CLIP op, s0, d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11 + \op \d0, \d0, \s0 + \op \d1, \d1, \s0 + \op \d2, \d2, \s0 + \op \d3, \d3, \s0 + \op \d4, \d4, \s0 + \op \d5, \d5, \s0 + \op \d6, \d6, \s0 + \op \d7, \d7, \s0 + \op \d8, \d8, \s0 + \op \d9, \d9, \s0 + \op \d10, \d10, \s0 + \op \d11, \d11, \s0 +.endm + +LoopH: + subs r12, r7, #1 + mov r11, r1 + vld1.16 {q3}, [r2]! + vld1.16 {d1, d2, d3}, [r11]! + COMPUTE vmul.f16, q3, d1, q4, q5, q6, q7 + COMPUTE vmul.f16, q3, d2, q8, q9, q10, q11 + COMPUTE vmul.f16, q3, d3, q12, q13, q14, q15 + beq LoopLEnd + LoopL: + vld1.16 {q3}, [r2]! + vld1.16 {d1, d2, d3}, [r11]! + COMPUTE vmla.f16, q3, d1, q4, q5, q6, q7 + COMPUTE vmla.f16, q3, d2, q8, q9, q10, q11 + COMPUTE vmla.f16, q3, d3, q12, q13, q14, q15 + + subs r12, r12, #1 + bne LoopL + LoopLEnd: + cmp r5, #0 + beq Store + vld1.16 {q3}, [r6]! + vmla.f16 q4, q3, d0[1] + vmla.f16 q5, q3, d0[1] + vmla.f16 q6, q3, d0[1] + vmla.f16 q7, q3, d0[1] + vmla.f16 q8, q3, d0[1] + vmla.f16 q9, q3, d0[1] + vmla.f16 q10, q3, d0[1] + vmla.f16 q11, q3, d0[1] + vmla.f16 q12, q3, d0[1] + vmla.f16 q13, q3, d0[1] + vmla.f16 q14, q3, d0[1] + vmla.f16 q15, q3, d0[1] + + b PostTreat + + LoadOrigin: + mov r11, r0 + vld1.16 {q1, q2}, [r11]! + vmla.f16 q4, q1, d0[1] + vmla.f16 q5, q2, d0[1] + + vld1.16 {q1, q2}, [r11]! + vmla.f16 q6, q1, d0[1] + vmla.f16 q7, q2, d0[1] + + vld1.16 {q1, q2}, [r11]! + vmla.f16 q8, q1, d0[1] + vmla.f16 q9, q2, d0[1] + + vld1.16 {q1, q2}, [r11]! + vmla.f16 q10, q1, d0[1] + vmla.f16 q11, q2, d0[1] + + vld1.16 {q1, q2}, [r11]! + vmla.f16 q12, q1, d0[1] + vmla.f16 q13, q2, d0[1] + + vld1.16 {q1, q2}, [r11]! + vmla.f16 q14, q1, d0[1] + vmla.f16 q15, q2, d0[1] + + PostTreat: + vdup.16 q2, d0[2] // min + vdup.16 q1, d0[3] // max + + CLIP vmax.f16, q2, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14, q15 + CLIP vmin.f16, q1, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14, q15 + + Store: + vst1.16 {q4, q5}, [r0]! + vst1.16 {q6, q7}, [r0]! + vst1.16 {q8, q9}, [r0]! + vst1.16 {q10, q11}, [r0]! + vst1.16 {q12, q13}, [r0]! + vst1.16 {q14, q15}, [r0]! + + add r0, r0, r8 + add r2, r2, r3 + + subs r4, r4, #1 + bne LoopH + +End: +vpop {q4-q7} +pop {r4-r11, pc} + +#endif +#endif diff --git a/source/backend/arm82/asm/arm32/MNNPackedMatMulRemainFP16.S b/source/backend/arm82/asm/arm32/MNNPackedMatMulRemainFP16.S new file mode 100644 index 00000000..5cc4f5c4 --- /dev/null +++ b/source/backend/arm82/asm/arm32/MNNPackedMatMulRemainFP16.S @@ -0,0 +1,211 @@ +// +// MNNPackedMatMulRemainFP16.S +// MNN +// +// Created by MNN on 2020/06/10. 
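Editor's note: `MNNPackedMatMulFP16` above multiplies a packed 12-row A panel by packed 8-column B panels and, when `postParameters` is supplied, adds a scaled bias and clamps to the [min, max] bounds carried in that vector (read off the assembly: p[1] scales the bias, p[2]/p[3] are the clamp bounds). A scalar reference of the per-tile semantics, written in float for readability:

```cpp
#include <algorithm>

// C tile of eP x hP values: C[x][y] = sum_k A[k][x] * B[k][y], then optional
// bias add and clamping, mirroring the PostTreat block of the assembly.
static void packedMatMulTileRef(float* C, const float* A, const float* B,
                                const float* bias, const float* post,
                                int eP /*12*/, int hP /*8*/, int l) {
    for (int x = 0; x < eP; ++x) {
        for (int y = 0; y < hP; ++y) {
            float acc = 0.f;
            for (int k = 0; k < l; ++k) {
                acc += A[k * eP + x] * B[k * hP + y];   // both panels are l-major
            }
            if (post != nullptr) {
                acc += bias[y] * post[1];                          // scaled bias
                acc = std::min(post[3], std::max(post[2], acc));   // clamp [min, max]
            }
            C[x * hP + y] = acc;
        }
    }
}
```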
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +// 12 * 8 MatMul +asm_function MNNPackedMatMulRemainFP16 +//void MNNPackedMatMulRemainFP16(FLOAT16* C, const FLOAT16* A, const FLOAT16* B, size_t eSize, const size_t* parameter, const FLOAT16* postParameters, const FLOAT16* bias); +//Auto r0: C, r1:A, r2:B, r3:eSize, +//r4:parameter, r5: cache no usage, r6:postParameters, r7:bias + +// r4: h, r8: l, r9: tmp r0, r10: tmp r1, r11: tmp r2, r12: aStride + +push {r4-r11, lr} +ldr r4, [sp, #36] +ldr r6, [sp, #40] +ldr r7, [sp, #44] +ldr r12, [r4, #0] +vpush {q4} +cmp r6, #0 +beq E8 +// q0-q2 +vld1.32 {q0}, [r6] +vcvt.f16.f32 d0, q0 +vdup.16 q1, d0[2] // min +vdup.16 q2, d0[3] // max + +.macro COMPUTE op, s0, s1, d0, d1, d2, d3 + \op \d0, \s0, \s1[0] + \op \d1, \s0, \s1[1] + \op \d2, \s0, \s1[2] + \op \d3, \s0, \s1[3] +.endm + +.macro CLIP op, s0, d0, d1, d2, d3 + \op \d0, \d0, \s0 + \op \d1, \d1, \s0 + \op \d2, \d2, \s0 + \op \d3, \d3, \s0 +.endm + +.macro ADD_BIAS s0, d0, d1, d2, d3 + vmla.f16 \d0, \s0, d0[1] + vmla.f16 \d1, \s0, d0[1] + vmla.f16 \d2, \s0, d0[1] + vmla.f16 \d3, \s0, d0[1] +.endm + +E8: +cmp r3, #8 +blt E4 +LoopE8: + ldr r5, [r4, #8] // h + add r5, r5, #7 + lsr r5, r5, #3 + mov r9, r0 + mov r11, r2 + push {r7} + LoopE8H: + mov r10, r1 + ldr r8, [r4, #4] // l + subs r8, r8, #1 + vld1.16 {q3}, [r10], r12 + vld1.16 {q4}, [r11]! + COMPUTE vmul.f16, q4, d6, q8, q9, q10, q11 + COMPUTE vmul.f16, q4, d7, q12, q13, q14, q15 + beq LoopE8LEnd + LoopE8L: + vld1.16 {q3}, [r10], r12 + vld1.16 {q4}, [r11]! + COMPUTE vmla.f16, q4, d6, q8, q9, q10, q11 + COMPUTE vmla.f16, q4, d7, q12, q13, q14, q15 + subs r8, r8, #1 + bne LoopE8L + + LoopE8LEnd: + cmp r6, #0 + beq StoreE8 + vld1.16 {q3}, [r7]! + ADD_BIAS q3, q8, q9, q10, q11 + ADD_BIAS q3, q12, q13, q14, q15 + CLIP vmax.f16, q1, q8, q9, q10, q11 + CLIP vmax.f16, q1, q12, q13, q14, q15 + CLIP vmin.f16, q2, q8, q9, q10, q11 + CLIP vmin.f16, q2, q12, q13, q14, q15 + + StoreE8: + ldr r8, [r4, #20] + add r11, r11, r8 + ldr r8, [r4, #12] + vst1.16 {q8, q9}, [r9]! + vst1.16 {q10, q11}, [r9]! + vst1.16 {q12, q13}, [r9]! + vst1.16 {q14, q15}, [r9], r8 + sub r9, r9, #96 + subs r5, r5, #1 + bne LoopE8H + sub r3, r3, #8 + add r0, r0, #128 + add r1, r1, #16 + cmp r3, #8 + pop {r7} + bge LoopE8 + + +E4: +cmp r3, #4 +blt E1 +LoopE4: + ldr r5, [r4, #8] // h + add r5, r5, #7 + lsr r5, r5, #3 + mov r9, r0 + mov r11, r2 + push {r7} + LoopE4H: + mov r10, r1 + ldr r8, [r4, #4] // l + subs r8, r8, #1 + vld1.16 {d6}, [r10], r12 + vld1.16 {q4}, [r11]! + COMPUTE vmul.f16, q4, d6, q8, q9, q10, q11 + beq LoopE4LEnd + LoopE4L: + vld1.16 {d6}, [r10], r12 + vld1.16 {q4}, [r11]! + COMPUTE vmla.f16, q4, d6, q8, q9, q10, q11 + subs r8, r8, #1 + bne LoopE4L + + LoopE4LEnd: + cmp r6, #0 + beq StoreE4 + vld1.16 {q3}, [r7]! + ADD_BIAS q3, q8, q9, q10, q11 + CLIP vmax.f16, q1, q8, q9, q10, q11 + CLIP vmin.f16, q2, q8, q9, q10, q11 + + StoreE4: + ldr r8, [r4, #20] // bExtraStride + add r11, r11, r8 + ldr r8, [r4, #12] // cStride + vst1.16 {q8, q9}, [r9]! + vst1.16 {q10, q11}, [r9], r8 + sub r9, r9, #32 + subs r5, r5, #1 + bne LoopE4H + sub r3, r3, #4 + add r0, r0, #64 + add r1, r1, #8 + cmp r3, #4 + pop {r7} + bge LoopE4 + +E1: +cmp r3, #0 +beq End +LoopE1: + ldr r5, [r4, #8] // h + add r5, r5, #7 + lsr r5, r5, #3 + mov r9, r0 + mov r11, r2 + push {r7} + LoopE1H: + mov r10, r1 + ldr r8, [r4, #4] // l + vmov.i32 q15, #0 + LoopE1L: + vld1.16 {d6[0]}, [r10], r12 + vld1.16 {q4}, [r11]! 
+ vmla.f16 q15, q4, d6[0] + subs r8, r8, #1 + bne LoopE1L + cmp r6, #0 + beq StoreE1 + vld1.16 {q14}, [r7]! + vmla.f16 q15, q14, d0[1] + + PostTreatE1: + vmax.f16 q15, q15, q1 + vmin.f16 q15, q15, q2 + + StoreE1: + ldr r8, [r4, #20] + add r11, r11, r8 + ldr r8, [r4, #12] + vst1.16 {q15}, [r9], r8 + subs r5, r5, #1 + bne LoopE1H + subs r3, r3, #1 + add r0, r0, #16 + add r1, r1, #2 + pop {r7} + bne LoopE1 +End: +vpop {q4} +pop {r4-r11, pc} + +#endif +#endif diff --git a/source/backend/arm82/asm/arm32/MNNQuantizeFP16_UNIT4.S b/source/backend/arm82/asm/arm32/MNNQuantizeFP16_UNIT4.S new file mode 100644 index 00000000..6e7b2040 --- /dev/null +++ b/source/backend/arm82/asm/arm32/MNNQuantizeFP16_UNIT4.S @@ -0,0 +1,57 @@ +// +// MNNQuantizeFP16_UNIT4.S +// MNN +// +// Created by MNN on 2020/02/13. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +asm_function MNNQuantizeFP16_UNIT4 +// void MNNQuantizeFP16_UNIT4(FLOAT16* dst, const float* src, int size); + +// Auto: +// r0:dst, r1:src, r2:size + +push {lr} + +L4: +cmp r2, #4 +blt L1 + +Loop4: +// {q0-q3} => {d16-d19 (q8-q9)} +vld1.32 {q0, q1}, [r1]! +vcvt.f16.f32 d16, q0 +vld1.32 {q2, q3}, [r1]! +vcvt.f16.f32 d17, q1 +vcvt.f16.f32 d18, q2 +vst1.16 {d16, d17}, [r0]! +sub r2, r2, #4 +vcvt.f16.f32 d19, q3 +cmp r2, #4 +vst1.16 {d18, d19}, [r0]! +bge Loop4 + +L1: +cmp r2, #0 +beq End + +Loop1: +vld1.32 {q0}, [r1]! +vcvt.f16.f32 d2, q0 +vst1.16 {d2}, [r0]! +subs r2, r2, #1 +bne Loop1 + +End: +pop {pc} + +#endif +#endif diff --git a/source/backend/arm82/asm/arm64/Arm82MNNPackForMatMul_A.S b/source/backend/arm82/asm/arm64/Arm82MNNPackForMatMul_A.S new file mode 100644 index 00000000..b380002c --- /dev/null +++ b/source/backend/arm82/asm/arm64/Arm82MNNPackForMatMul_A.S @@ -0,0 +1,508 @@ +// +// Arm82MNNPackForMatMul_A.S +// MNN +// +// Created by MNN on 2020/06/10. 
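Editor's note: `MNNQuantizeFP16_UNIT4` above is effectively an FP32-to-FP16 precision conversion (not integer quantization), processing four floats per unit with a 4x-unrolled main loop. An equivalent intrinsic-level sketch:

```cpp
#include <arm_neon.h>

// FP32 -> FP16 conversion, 4 floats per unit, mirroring MNNQuantizeFP16_UNIT4.
// `sizeInUnit4` counts groups of 4 elements, as in the assembly's size argument.
static void quantizeFP16Unit4Sketch(float16_t* dst, const float* src, int sizeInUnit4) {
    for (int i = 0; i < sizeInUnit4; ++i) {
        float32x4_t v = vld1q_f32(src + 4 * i);
        vst1_f16(dst + 4 * i, vcvt_f16_f32(v));   // narrow to 4 halves
    }
}
```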
+// Copyright © 2018, Alibaba Group Holding Limited +// +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" +// (l/8,e,8) -> (e/12,l,12) +// trans 8x12 == trans 8x8 + trans 4x4 + trans 4x4 + +.text +.align 5 +asm_function Arm82MNNPackForMatMul_A +//void Arm82MNNPackForMatMul_A(FLOAT16* destOrigin, FLOAT16 const** sourceGroup, const int32_t* info, const int32_t* el) +//Auto: x0: dest, x1:sourceGroup, x2: info, x3:el +sub sp, sp, #128 +st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 +st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 + +ldr w10, [x2, #0] // number +mov x4, #0 +mov x11, #0 +mov x6, #0 +ldr w4, [x2, #4] // eReal +ldr w11, [x2, #8] // eDest +ldr w6, [x2, #12] // xOffset +// xOffset -> xOffset * 8 * sizeof(FLOAT16) +// eReal -> eReal * 8 * sizeof(FLOAT16) +// eDest -> eDest * sizeof(FLOAT16) +mov x9, #2 // sizeof(FLOAT16) +mov x12, #16 // 8 * sizeof(FLOAT16) +mul x4, x12, x4 +mul x11, x9, x11 +mul x6, x12, x6 + +LoopNumber: +mov x5, #0 +mov x8, #0 +mov x7, #0 +ldr w5, [x3, #4] // l +ldr w8, [x3, #8] // eOffset +ldr w7, [x3, #12] // lOffset + +mov x13, x0 +mov x14, x1 +ldr x1, [x1, #0] + +// Compute dest ptr: x0 = x0 + eOffset * sizeof(FLOAT16) + lOffset * eDest * sizeof(FLOAT16) +mov x9, #2 // sizeof(FLOAT16) +mul x7, x11, x7 +mul x8, x9, x8 +add x0, x0, x7 +add x0, x0, x8 +mov x2, #0 +ldr w2, [x3, #0] // e + +Body: + cmp w2, #12 // eP + blt E8 + cmp x5, #8 + blt Body_LoopLExtra + Body_LoopL8: + mov x2, x1 + +.macro TRANSPOSE_8x8 d0, d1, d2, d3, d4, d5, d6, d7, t0, t1, t2, t3, t4, t5, t6, t7 + zip1 \t0\().8h, v0.8h, v1.8h + zip2 \t1\().8h, v0.8h, v1.8h + zip1 \t2\().8h, v2.8h, v3.8h + zip2 \t3\().8h, v2.8h, v3.8h + zip1 \t4\().8h, v4.8h, v5.8h + zip2 \t5\().8h, v4.8h, v5.8h + zip1 \t6\().8h, v6.8h, v7.8h + zip2 \t7\().8h, v6.8h, v7.8h + zip1 v0.4s, \t0\().4s, \t2\().4s + zip2 v1.4s, \t0\().4s, \t2\().4s + zip1 v2.4s, \t1\().4s, \t3\().4s + zip2 v3.4s, \t1\().4s, \t3\().4s + zip1 v4.4s, \t4\().4s, \t6\().4s + zip2 v5.4s, \t4\().4s, \t6\().4s + zip1 v6.4s, \t5\().4s, \t7\().4s + zip2 v7.4s, \t5\().4s, \t7\().4s + zip1 \d0\().2d, v0.2d, v4.2d + zip2 \d1\().2d, v0.2d, v4.2d + zip1 \d2\().2d, v1.2d, v5.2d + zip2 \d3\().2d, v1.2d, v5.2d + zip1 \d4\().2d, v2.2d, v6.2d + zip2 \d5\().2d, v2.2d, v6.2d + zip1 \d6\().2d, v3.2d, v7.2d + zip2 \d7\().2d, v3.2d, v7.2d +.endm + +.macro TRANSPOSE_8x4 s0, s1, s2, s3, d0, d1, d2, d3, t0, t1, t2, t3 + zip1 \t0\().8h, \s0\().8h, \s1\().8h + zip2 \t1\().8h, \s0\().8h, \s1\().8h + zip1 \t2\().8h, \s2\().8h, \s3\().8h + zip2 \t3\().8h, \s2\().8h, \s3\().8h + zip1 \d0\().4s, \t0\().4s, \t2\().4s + zip2 \d1\().4s, \t0\().4s, \t2\().4s + zip1 \d2\().4s, \t1\().4s, \t3\().4s + zip2 \d3\().4s, \t1\().4s, \t3\().4s +.endm + +.macro MAIN_TRANSPOSE_E12 +// src:[v0-v11] + ld1 {v0.8h}, [x1], x6 + ld1 {v1.8h}, [x1], x6 + ld1 {v2.8h}, [x1], x6 + ld1 {v3.8h}, [x1], x6 + ld1 {v4.8h}, [x1], x6 + ld1 {v5.8h}, [x1], x6 + ld1 {v6.8h}, [x1], x6 + ld1 {v7.8h}, [x1], x6 + ld1 {v8.8h}, [x1], x6 + ld1 {v9.8h}, [x1], x6 + ld1 {v10.8h}, [x1], x6 + ld1 {v11.8h}, [x1], x6 +// [v0, v1, v2, v3, v4, v5, v6, v7] => [v20, v12, v23, v13, v26, v14, v29, v15] +// tmp: [21, 22, 24, 25, 27, 28, 30, 31] + TRANSPOSE_8x8 v20, v12, v23, v13, v26, v14, v29, v15, v21, v22, v24, v25, v27, v28, v30, v31 +// [v8, v9, v10, v11] => [v16, v17, v18, v19] +// tmp can be used: [0, 1, 2, 3, 4, 5, 6, 7, 21, 22, 24, 25, 27, 28, 30, 31] + TRANSPOSE_8x4 v8, v9, v10, v11, v16, v17, v18, v19, v0, v1, v2, v3 +// merge: [(v12, v16), (v13, v17), (v14, v18), (v15, v19)] => [(v21, v22), (v24, v25), (v27, v28), (v30, v31)] + 
trn1 v21.2d, v16.2d, v12.2d + trn2 v22.2d, v12.2d, v16.2d + trn1 v24.2d, v17.2d, v13.2d + trn2 v25.2d, v13.2d, v17.2d + trn1 v27.2d, v18.2d, v14.2d + trn2 v28.2d, v14.2d, v18.2d + trn1 v30.2d, v19.2d, v15.2d + trn2 v31.2d, v15.2d, v19.2d +// dst:[v20-v31] +.endm + MAIN_TRANSPOSE_E12 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], #64 + st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], #64 + + add x1, x2, x4 + sub x5, x5, #8 + cmp x5, #8 + bge Body_LoopL8 + + cbz x5, Body_LoopLEnd + Body_LoopLExtra: + MAIN_TRANSPOSE_E12 + cmp x5, #7 // if x5 < 7 + blt Body_LoopLEx6 // jump to Body_LoopLEx6 + Body_LoopLEx7: + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], #64 + st1 {v28.8h, v29.8h}, [x0], #32 + st1 {v30.4h}, [x0], #8 + b Body_LoopLEnd + Body_LoopLEx6: + cmp x5, #6 // if x5 < 6 + blt Body_LoopLEx5 // jump to Body_LoopLEx5 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], #64 + st1 {v28.8h}, [x0], #16 + b Body_LoopLEnd + Body_LoopLEx5: + cmp x5, #5 // if x5 < 5 + blt Body_LoopLEx4 // jump to Body_LoopLEx4 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + st1 {v24.8h, v25.8h, v26.8h}, [x0], #48 + st1 {v27.4h}, [x0], #8 + b Body_LoopLEnd + Body_LoopLEx4: + cmp x5, #4 // if x5 < 4 + blt Body_LoopLEx3 // jump to Body_LoopLEx3 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + st1 {v24.8h, v25.8h}, [x0], #32 + b Body_LoopLEnd + Body_LoopLEx3: + cmp x5, #3 // if x5 < 3 + blt Body_LoopLEx2 // jump to Body_LoopLEx2 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + st1 {v24.4h}, [x0], #8 + b Body_LoopLEnd + Body_LoopLEx2: + cmp x5, #2 // if x5 < 2 + blt Body_LoopLEx1 // jump to Body_LoopLEx1 + st1 {v20.8h, v21.8h, v22.8h}, [x0], #48 + b Body_LoopLEnd + Body_LoopLEx1: + cmp x5, #1 // if x5 < 1 + blt Body_LoopLEnd + st1 {v20.8h}, [x0], #16 + st1 {v21.4h}, [x0], #8 + Body_LoopLEnd: + b End + +E8: + cmp w2, #8 + blt E4 + + mov x9, x5 + mov x7, x1 + mov x8, x0 + cmp x5, #8 + blt E8_LoopLExtra + E8_LoopL8: + mov x12, x1 + .macro MAIN_TRANSPOSE_E8 + // src:[v0-v7] + ld1 {v0.8h}, [x1], x6 + ld1 {v1.8h}, [x1], x6 + ld1 {v2.8h}, [x1], x6 + ld1 {v3.8h}, [x1], x6 + ld1 {v4.8h}, [x1], x6 + ld1 {v5.8h}, [x1], x6 + ld1 {v6.8h}, [x1], x6 + ld1 {v7.8h}, [x1], x6 + TRANSPOSE_8x8 v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23 + .endm + + MAIN_TRANSPOSE_E8 + st1 {v8.8h}, [x0], x11 + st1 {v9.8h}, [x0], x11 + st1 {v10.8h}, [x0], x11 + st1 {v11.8h}, [x0], x11 + st1 {v12.8h}, [x0], x11 + st1 {v13.8h}, [x0], x11 + st1 {v14.8h}, [x0], x11 + st1 {v15.8h}, [x0], x11 + + add x1, x12, x4 + sub x5, x5, #8 + cmp x5, #8 + bge E8_LoopL8 + + cbz x5, E8_LoopLEnd + E8_LoopLExtra: + MAIN_TRANSPOSE_E8 + cmp x5, #7 // if x5 < 7 + blt E8_LoopLEx6 // jump to E8_LoopLEx6 + E8_LoopLEx7: + st1 {v8.8h}, [x0], x11 + st1 {v9.8h}, [x0], x11 + st1 {v10.8h}, [x0], x11 + st1 {v11.8h}, [x0], x11 + st1 {v12.8h}, [x0], x11 + st1 {v13.8h}, [x0], x11 + st1 {v14.8h}, [x0], x11 + b E8_LoopLEnd + E8_LoopLEx6: + cmp x5, #6 // if x5 < 6 + blt E8_LoopLEx5 // jump to E8_LoopLEx5 + st1 {v8.8h}, [x0], x11 + st1 {v9.8h}, [x0], x11 + st1 {v10.8h}, [x0], x11 + st1 {v11.8h}, [x0], x11 + st1 {v12.8h}, [x0], x11 + st1 {v13.8h}, [x0], x11 + b E8_LoopLEnd + E8_LoopLEx5: + cmp x5, #5 // if x5 < 5 + blt E8_LoopLEx4 // jump to E8_LoopLEx4 + st1 {v8.8h}, [x0], x11 + st1 {v9.8h}, [x0], x11 + st1 {v10.8h}, [x0], x11 + st1 {v11.8h}, [x0], x11 + st1 {v12.8h}, [x0], x11 + b E8_LoopLEnd + E8_LoopLEx4: + cmp x5, #4 // 
if x5 < 4 + blt E8_LoopLEx3 // jump to E8_LoopLEx3 + st1 {v8.8h}, [x0], x11 + st1 {v9.8h}, [x0], x11 + st1 {v10.8h}, [x0], x11 + st1 {v11.8h}, [x0], x11 + b E8_LoopLEnd + E8_LoopLEx3: + cmp x5, #3 // if x5 < 3 + blt E8_LoopLEx2 // jump to E8_LoopLEx2 + st1 {v8.8h}, [x0], x11 + st1 {v9.8h}, [x0], x11 + st1 {v10.8h}, [x0], x11 + b E8_LoopLEnd + E8_LoopLEx2: + cmp x5, #2 // if x5 < 2 + blt E8_LoopLEx1 // jump to E8_LoopLEx1 + st1 {v8.8h}, [x0], x11 + st1 {v9.8h}, [x0], x11 + b E8_LoopLEnd + E8_LoopLEx1: + cmp x5, #1 // if x5 < 1 + blt E8_LoopLEnd + st1 {v8.8h}, [x0], x11 + E8_LoopLEnd: + sub w2, w2, #8 + add x0, x8, #16 // 8 * sizeof(FLOAT16) + add x1, x7, x6, LSL #3 + mov w5, w9 + cbz w2, End + +E4: + cmp w2, #4 + blt E1 + + mov x9, x5 + mov x7, x1 + mov x8, x0 + cmp x5, #8 + blt E4_LoopLExtra + E4_LoopL8: + mov x12, x1 + .macro MAIN_TRANSPOSE_E4 + // src:[v0-v7] + ld1 {v0.8h}, [x1], x6 + ld1 {v1.8h}, [x1], x6 + ld1 {v2.8h}, [x1], x6 + ld1 {v3.8h}, [x1], x6 + TRANSPOSE_8x4 v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11 + .endm + + MAIN_TRANSPOSE_E4 + st1 {v4.d}[0], [x0], x11 + st1 {v4.d}[1], [x0], x11 + st1 {v5.d}[0], [x0], x11 + st1 {v5.d}[1], [x0], x11 + st1 {v6.d}[0], [x0], x11 + st1 {v6.d}[1], [x0], x11 + st1 {v7.d}[0], [x0], x11 + st1 {v7.d}[1], [x0], x11 + + add x1, x12, x4 + sub x5, x5, #8 + cmp x5, #8 + bge E4_LoopL8 + + cbz x5, E4_LoopLEnd + E4_LoopLExtra: + MAIN_TRANSPOSE_E4 + cmp x5, #7 // if x5 < 7 + blt E4_LoopLEx6 // jump to E4_LoopLEx6 + E4_LoopLEx7: + st1 {v4.d}[0], [x0], x11 + st1 {v4.d}[1], [x0], x11 + st1 {v5.d}[0], [x0], x11 + st1 {v5.d}[1], [x0], x11 + st1 {v6.d}[0], [x0], x11 + st1 {v6.d}[1], [x0], x11 + st1 {v7.d}[0], [x0], x11 + b E4_LoopLEnd + E4_LoopLEx6: + cmp x5, #6 // if x5 < 6 + blt E4_LoopLEx5 // jump to E4_LoopLEx5 + st1 {v4.d}[0], [x0], x11 + st1 {v4.d}[1], [x0], x11 + st1 {v5.d}[0], [x0], x11 + st1 {v5.d}[1], [x0], x11 + st1 {v6.d}[0], [x0], x11 + st1 {v6.d}[1], [x0], x11 + b E4_LoopLEnd + E4_LoopLEx5: + cmp x5, #5 // if x5 < 5 + blt E4_LoopLEx4 // jump to E4_LoopLEx4 + st1 {v4.d}[0], [x0], x11 + st1 {v4.d}[1], [x0], x11 + st1 {v5.d}[0], [x0], x11 + st1 {v5.d}[1], [x0], x11 + st1 {v6.d}[0], [x0], x11 + b E4_LoopLEnd + E4_LoopLEx4: + cmp x5, #4 // if x5 < 4 + blt E4_LoopLEx3 // jump to E4_LoopLEx3 + st1 {v4.d}[0], [x0], x11 + st1 {v4.d}[1], [x0], x11 + st1 {v5.d}[0], [x0], x11 + st1 {v5.d}[1], [x0], x11 + b E4_LoopLEnd + E4_LoopLEx3: + cmp x5, #3 // if x5 < 3 + blt E4_LoopLEx2 // jump to E4_LoopLEx2 + st1 {v4.d}[0], [x0], x11 + st1 {v4.d}[1], [x0], x11 + st1 {v5.d}[0], [x0], x11 + b E4_LoopLEnd + E4_LoopLEx2: + cmp x5, #2 // if x5 < 2 + blt E4_LoopLEx1 // jump to E4_LoopLEx1 + st1 {v4.d}[0], [x0], x11 + st1 {v4.d}[1], [x0], x11 + b E4_LoopLEnd + E4_LoopLEx1: + cmp x5, #1 // if x5 < 1 + blt E4_LoopLEnd + st1 {v4.d}[0], [x0], x11 + E4_LoopLEnd: + sub w2, w2, #4 + add x0, x8, #8 // 4 * sizeof(FLOAT16) + add x1, x7, x6, LSL #2 + mov w5, w9 + cbz w2, End + +E1: +LoopE1: + mov x9, x5 + mov x7, x1 + mov x8, x0 + cmp x5, #8 + blt E1_LoopLEx7 + + E1_LoopL8: + ld1 {v0.8h}, [x1], x4 + st1 {v0.h}[0], [x0], x11 + st1 {v0.h}[1], [x0], x11 + st1 {v0.h}[2], [x0], x11 + st1 {v0.h}[3], [x0], x11 + st1 {v0.h}[4], [x0], x11 + st1 {v0.h}[5], [x0], x11 + st1 {v0.h}[6], [x0], x11 + st1 {v0.h}[7], [x0], x11 + sub x5, x5, #8 + cmp x5, #8 + bge E1_LoopL8 + + E1_LoopLEx7: + cmp x5, #7 // if x5 < 7 + blt E1_LoopLEx6 // jump to E1_LoopLEx6 + ld1 {v0.8h}, [x1], x4 + st1 {v0.h}[0], [x0], x11 + st1 {v0.h}[1], [x0], x11 + st1 {v0.h}[2], [x0], x11 + st1 {v0.h}[3], [x0], x11 + st1 {v0.h}[4], 
[x0], x11 + st1 {v0.h}[5], [x0], x11 + st1 {v0.h}[6], [x0], x11 + b E1_LoopLEnd + E1_LoopLEx6: + cmp x5, #6 // if x5 < 6 + blt E1_LoopLEx5 // jump to E1_LoopLEx5 + ld1 {v0.8h}, [x1], x4 + st1 {v0.h}[0], [x0], x11 + st1 {v0.h}[1], [x0], x11 + st1 {v0.h}[2], [x0], x11 + st1 {v0.h}[3], [x0], x11 + st1 {v0.h}[4], [x0], x11 + st1 {v0.h}[5], [x0], x11 + b E1_LoopLEnd + E1_LoopLEx5: + cmp x5, #5 // if x5 < 5 + blt E1_LoopLEx4 // jump to E1_LoopLEx4 + ld1 {v0.8h}, [x1], x4 + st1 {v0.h}[0], [x0], x11 + st1 {v0.h}[1], [x0], x11 + st1 {v0.h}[2], [x0], x11 + st1 {v0.h}[3], [x0], x11 + st1 {v0.h}[4], [x0], x11 + b E1_LoopLEnd + E1_LoopLEx4: + cmp x5, #4 // if x5 < 4 + blt E1_LoopLEx3 // jump to E1_LoopLEx3 + ld1 {v0.d}[0], [x1], x4 + st1 {v0.h}[0], [x0], x11 + st1 {v0.h}[1], [x0], x11 + st1 {v0.h}[2], [x0], x11 + st1 {v0.h}[3], [x0], x11 + b E1_LoopLEnd + E1_LoopLEx3: + cmp x5, #3 // if x5 < 3 + blt E1_LoopLEx2 // jump to E1_LoopLEx2 + ld1 {v0.d}[0], [x1], x4 + st1 {v0.h}[0], [x0], x11 + st1 {v0.h}[1], [x0], x11 + st1 {v0.h}[2], [x0], x11 + b E1_LoopLEnd + E1_LoopLEx2: + cmp x5, #2 // if x5 < 2 + blt E1_LoopLEx1 // jump to E1_LoopLEx1 + ld1 {v0.s}[0], [x1], x4 + st1 {v0.h}[0], [x0], x11 + st1 {v0.h}[1], [x0], x11 + b E1_LoopLEnd + E1_LoopLEx1: + cmp x5, #1 // if x5 < 1 + blt E1_LoopLEnd + ld1 {v0.h}[0], [x1], x4 + st1 {v0.h}[0], [x0], x11 + E1_LoopLEnd: + subs w2, w2, #1 + add x0, x8, #2 // sizeof(FLOAT16) + add x1, x7, x6 + mov w5, w9 + bne LoopE1 + +End: + +mov x0, x13 +mov x1, x14 +subs w10, w10, #1 +add x3, x3, #16 // 4 * sizeof(int32_t) +add x1, x1, #8 // sizeof(FLOAT16*) + +bne LoopNumber + +sub sp, sp, #128 +ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 +ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 + +ret + +#endif diff --git a/source/backend/arm82/asm/arm64/MNNConvDwF23MulTransUnitFP16.S b/source/backend/arm82/asm/arm64/MNNConvDwF23MulTransUnitFP16.S new file mode 100644 index 00000000..73286a88 --- /dev/null +++ b/source/backend/arm82/asm/arm64/MNNConvDwF23MulTransUnitFP16.S @@ -0,0 +1,91 @@ +// +// MNNConvDwF23MulTransUnitFP16.S +// MNN +// +// Created by MNN on 2019/4/4. 
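Editor's note, not part of the patch: the E12/E8/E4/E1 paths of the tiled pack routine above all come down to transposing a tile so that consecutive source rows become consecutive destination columns; the trn1/trn2 ladder in TRANSPOSE_8x8 is an 8x8 in-register transpose. A minimal scalar sketch (float standing in for the FLOAT16 lanes, names illustrative):

#include <cstddef>

// Scalar equivalent of the TRANSPOSE_8x8 macro: dst[j][i] = src[i][j]
// for one 8x8 tile, with row strides given in elements.
static void Transpose8x8Ref(float* dst, const float* src,
                            size_t srcStride, size_t dstStride) {
    for (int i = 0; i < 8; ++i) {
        for (int j = 0; j < 8; ++j) {
            dst[j * dstStride + i] = src[i * srcStride + j];
        }
    }
}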
+// Copyright © 2018, Alibaba Group Holding Limited +// +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNConvDwF23MulTransUnitFP16 +//void MNNConvDwF23MulTransUnitFP16(FLOAT16 **cacheLine, const FLOAT16 *weigth, FLOAT16 *dest, size_t ow); +//Auto: x0:cacheLine, x1:weight, x2:dest, x3:ow +ldr x4, [x0, #0] +ldr x5, [x0, #8] +ldr x6, [x0, #16] + +ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x1], #64 +ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], #64 +ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x1] + +L2: +cmp x3, #2 +blt L1 + +LoopL2: + +ld1 {v20.8h, v21.8h}, [x4], #32 +fmul v0.8h, v4.8h, v20.8h +ld1 {v22.8h, v23.8h}, [x4], #32 +fmul v1.8h, v5.8h, v21.8h +fmul v2.8h, v6.8h, v22.8h +ld1 {v20.8h, v21.8h}, [x5], #32 +fmul v3.8h, v7.8h, v23.8h + +fmla v0.8h, v16.8h, v20.8h +ld1 {v22.8h, v23.8h}, [x5], #32 +fmla v1.8h, v17.8h, v21.8h +fmla v2.8h, v18.8h, v22.8h +fmla v3.8h, v19.8h, v23.8h + +ld1 {v20.8h, v21.8h}, [x6], #32 +fmla v0.8h, v28.8h, v20.8h +fmla v1.8h, v29.8h, v21.8h +fadd v0.8h, v1.8h, v0.8h +ld1 {v22.8h, v23.8h}, [x6], #32 + +fmla v2.8h, v30.8h, v22.8h +fmla v3.8h, v31.8h, v23.8h +fadd v0.8h, v0.8h, v2.8h + +fadd v3.8h, v3.8h, v1.8h +fsub v1.8h, v3.8h, v2.8h + +st1 {v0.8h, v1.8h}, [x2], #32 + +sub x3, x3, #2 +cmp x3, #2 +bge LoopL2 + + +L1: +cmp x3, #0 +beq End +ld1 {v20.8h, v21.8h, v22.8h}, [x4] +fmul v0.8h, v4.8h, v20.8h +fmul v1.8h, v5.8h, v21.8h +fmul v2.8h, v6.8h, v22.8h +ld1 {v20.8h, v21.8h, v22.8h}, [x5] + +fmla v0.8h, v16.8h, v20.8h +fmla v1.8h, v17.8h, v21.8h +fmla v2.8h, v18.8h, v22.8h + +ld1 {v20.8h, v21.8h, v22.8h}, [x6] +fmla v0.8h, v28.8h, v20.8h +fmla v1.8h, v29.8h, v21.8h +fadd v0.8h, v1.8h, v0.8h + +fmla v2.8h, v30.8h, v22.8h +fadd v0.8h, v0.8h, v2.8h + +st1 {v0.8h}, [x2] +End: + +ret +#endif diff --git a/source/backend/arm82/asm/arm64/MNNConvDwF23SourceTransUnitFP16.S b/source/backend/arm82/asm/arm64/MNNConvDwF23SourceTransUnitFP16.S new file mode 100644 index 00000000..cac31e53 --- /dev/null +++ b/source/backend/arm82/asm/arm64/MNNConvDwF23SourceTransUnitFP16.S @@ -0,0 +1,56 @@ +// +// MNNConvDwF23SourceTransUnitFP16.S +// MNN +// +// Created by MNN on 2019/4/4. +// Copyright © 2018, Alibaba Group Holding Limited +// +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNConvDwF23SourceTransUnitFP16 +// void MNNConvDwF23SourceTransUnitFP16(const FLOAT16 *source, FLOAT16 *dest, size_t unit); + +//Auto: +//x0: source, x1:dest, x2:unit + +L1: +cmp x2, #0 +beq End + +ld1 {v16.8h, v17.8h}, [x0], #32 +ld1 {v18.8h, v19.8h}, [x0], #32 +subs x2, x2, #1 +fsub v0.8h, v16.8h, v18.8h +fadd v1.8h, v17.8h, v18.8h +beq L1LoopEnd + +L1Loop: + fsub v2.8h, v18.8h, v17.8h + st1 {v0.8h, v1.8h}, [x1], #32 + fsub v3.8h, v19.8h, v17.8h + mov v16.16b, v18.16b + st1 {v2.8h, v3.8h}, [x1], #32 + mov v17.16b, v19.16b + ld1 {v18.8h, v19.8h}, [x0], #32 + fsub v0.8h, v16.8h, v18.8h + fadd v1.8h, v17.8h, v18.8h + + subs x2, x2, #1 + bne L1Loop +L1LoopEnd: +fsub v2.8h, v18.8h, v17.8h +fsub v3.8h, v19.8h, v17.8h + +st1 {v0.8h, v1.8h}, [x1], #32 +st1 {v2.8h, v3.8h}, [x1], #32 + + +End: +ret + +#endif diff --git a/source/backend/arm82/asm/arm64/MNNConvRunForLineDepthwiseFP16.S b/source/backend/arm82/asm/arm64/MNNConvRunForLineDepthwiseFP16.S new file mode 100644 index 00000000..1cb449d2 --- /dev/null +++ b/source/backend/arm82/asm/arm64/MNNConvRunForLineDepthwiseFP16.S @@ -0,0 +1,263 @@ +// +// MNNConvRunForLineDepthwiseFP16.S +// MNN +// +// Created by MNN on 2019/02/04. 
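Editor's note, not part of the patch: the two Winograd F(2,3) helpers added above are easier to follow next to a scalar reference. The sketch below is inferred from the assembly, with the 8-lane channel dimension dropped and float used in place of FLOAT16; names are illustrative only.

#include <cstddef>

// Input transform d = B^T * s over a sliding window of 4 samples, advancing by 2
// (mirrors MNNConvDwF23SourceTransUnitFP16).
static void ConvDwF23SourceTransUnitRef(const float* src, float* dst, size_t unit) {
    for (size_t i = 0; i < unit; ++i, src += 2, dst += 4) {
        dst[0] = src[0] - src[2];
        dst[1] = src[1] + src[2];
        dst[2] = src[2] - src[1];
        dst[3] = src[3] - src[1];
    }
}

// Multiply three transformed rows by the transformed 3x4 weights, then apply the
// output transform A^T: two outputs per group of four products (mirrors
// MNNConvDwF23MulTransUnitFP16). The odd-width tail handled by the L1 path in the
// assembly is omitted here.
static void ConvDwF23MulTransUnitRef(float* const cacheLine[3], const float* weight,
                                     float* dest, size_t ow) {
    for (size_t x = 0; x + 1 < ow; x += 2) {
        float m[4];
        for (int i = 0; i < 4; ++i) {
            m[i] = weight[0 * 4 + i] * cacheLine[0][2 * x + i]
                 + weight[1 * 4 + i] * cacheLine[1][2 * x + i]
                 + weight[2 * 4 + i] * cacheLine[2][2 * x + i];
        }
        dest[x + 0] = m[0] + m[1] + m[2];
        dest[x + 1] = m[1] - m[2] + m[3];
    }
}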
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNConvRunForLineDepthwiseFP16 +//void MNNConvRunForLineDepthwiseFP16(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight, size_t width, size_t src_w_setup, +// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep) + +//Auto Load: +//x0:dst, x1:src, x2:weight, x3:width, x4:src_w_setup, x5:fw, x6:fh, x7:dilate_x_step + +//Load From sp: +//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep +ldr x8, [sp, #0] +ldr x15, [sp, #8] +ldr x10, [sp, #16] +ldr x11, [sp, #24] + +mov x9, #2 // sizeof(FLOAT16) +mul x4, x9, x4 +mul x7, x9, x7 +mul x8, x9, x8 +mul x10, x9, x10 +mul x11, x9, x11 + +//dilate_y_step -> dilate_y_step - fw*dilate_x_step +mul x9, x5, x7 +sub x8, x8, x9 + +.macro zero_vec x0, x1, x2, x3 + movi \x0\().8h, #0 + movi \x1\().8h, #0 + movi \x2\().8h, #0 + movi \x3\().8h, #0 +.endm + +LoopDY: +mov v4.d[0], x10 +mov v4.d[1], x11 +mov v5.d[0], x0 +mov v5.d[1], x1 +mov v6.d[0], x3 + +L16: +cmp x3, #16 +blt L8 + +mov x12, #16 +mul x12, x4, x12 + +L16Loop: + zero_vec v16, v17, v18, v19 + zero_vec v20, v21, v22, v23 + zero_vec v24, v25, v26, v27 + zero_vec v28, v29, v30, v31 + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L16LoopH: + mov x10, x5 + L16LoopW: + ld1 {v7.8h}, [x2], #16 + ld1 {v0.8h}, [x1], x4 + subs x10, x10, #1 + ld1 {v1.8h}, [x1], x4 + fmla v16.8h, v7.8h, v0.8h + fmla v17.8h, v7.8h, v1.8h + ld1 {v2.8h}, [x1], x4 + ld1 {v3.8h}, [x1], x4 + fmla v18.8h, v7.8h, v2.8h + fmla v19.8h, v7.8h, v3.8h + ld1 {v0.8h}, [x1], x4 + ld1 {v1.8h}, [x1], x4 + fmla v20.8h, v7.8h, v0.8h + fmla v21.8h, v7.8h, v1.8h + ld1 {v2.8h}, [x1], x4 + ld1 {v3.8h}, [x1], x4 + fmla v22.8h, v7.8h, v2.8h + fmla v23.8h, v7.8h, v3.8h + + ld1 {v0.8h}, [x1], x4 + ld1 {v1.8h}, [x1], x4 + fmla v24.8h, v7.8h, v0.8h + fmla v25.8h, v7.8h, v1.8h + ld1 {v2.8h}, [x1], x4 + ld1 {v3.8h}, [x1], x4 + fmla v26.8h, v7.8h, v2.8h + fmla v27.8h, v7.8h, v3.8h + ld1 {v0.8h}, [x1], x4 + ld1 {v1.8h}, [x1], x4 + fmla v28.8h, v7.8h, v0.8h + fmla v29.8h, v7.8h, v1.8h + ld1 {v2.8h}, [x1], x4 + ld1 {v3.8h}, [x1], x4 + fmla v30.8h, v7.8h, v2.8h + fmla v31.8h, v7.8h, v3.8h + sub x1, x1, x12 + add x1, x1, x7 + + bne L16LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L16LoopH + + sub x3, x3, #16 + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 + add x1, x13, x12 + cmp x3, #16 + mov x2, x14 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], #64 + st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], #64 + bge L16Loop + + +L8: +cmp x3, #7 +ble L4 + +mov x12, #8 +mul x12, x4, x12 + +L8Loop: + zero_vec v16, v17, v18, v19 + zero_vec v20, v21, v22, v23 + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L8LoopH: + mov x10, x5 + L8LoopW: + ld1 {v3.8h}, [x2], #16 + ld1 {v0.8h}, [x1], x4 + subs x10, x10, #1 + fmla v16.8h, v3.8h, v0.8h + ld1 {v1.8h}, [x1], x4 + fmla v17.8h, v3.8h, v1.8h + ld1 {v0.8h}, [x1], x4 + fmla v18.8h, v0.8h, v3.8h + ld1 {v1.8h}, [x1], x4 + fmla v19.8h, v1.8h, v3.8h + ld1 {v0.8h}, [x1], x4 + fmla v20.8h, v0.8h, v3.8h + ld1 {v1.8h}, [x1], x4 + fmla v21.8h, v1.8h, v3.8h + ld1 {v0.8h}, [x1], x4 + fmla v22.8h, v0.8h, v3.8h + ld1 {v1.8h}, [x1], x4 + fmla v23.8h, v1.8h, v3.8h + + sub x1, x1, x12 + add x1, x1, x7 + + bne L8LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L8LoopH + + sub x3, x3, #8 + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 + add x1, x13, x12 + mov x2, x14 + st1 {v20.8h, v21.8h, v22.8h, 
v23.8h}, [x0], #64 + + +L4: +cmp x3, #4 +ble L1 + +mov x12, #4 +mul x12, x4, x12 + +L4Loop: + zero_vec v16, v17, v18, v19 + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L4LoopH: + mov x10, x5 + L4LoopW: + ld1 {v3.8h}, [x2], #16 + ld1 {v0.8h}, [x1], x4 + subs x10, x10, #1 + fmla v16.8h, v3.8h, v0.8h + ld1 {v1.8h}, [x1], x4 + fmla v17.8h, v3.8h, v1.8h + ld1 {v0.8h}, [x1], x4 + fmla v18.8h, v0.8h, v3.8h + ld1 {v1.8h}, [x1], x4 + fmla v19.8h, v1.8h, v3.8h + + sub x1, x1, x12 + add x1, x1, x7 + + bne L4LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L4LoopH + + sub x3, x3, #4 + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 + add x1, x13, x12 + mov x2, x14 + +L1: +cmp x3, #0 +beq End + +L1Loop: + movi v0.8h, #0 + mov x9, x6 + mov x11, x1 + mov x12, x2 + L1LoopH: + mov x10, x5 + L1LoopW: + ld1 {v1.8h}, [x1], x7 + ld1 {v2.8h}, [x2], #16 + fmla v0.8h, v1.8h, v2.8h + subs x10, x10, #1 + bne L1LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L1LoopH + + subs x3, x3, #1 + st1 {v0.8h}, [x0], #16 + mov x2, x12 + add x1, x11, x4 + bne L1Loop + + +End: + +mov x10, v4.d[0] +mov x11, v4.d[1] +mov x0, v5.d[0] +mov x1, v5.d[1] +mov x3, v6.d[0] + +subs x15, x15, #1 +add x0, x0, x11 +add x1, x1, x10 +bne LoopDY + + +ret + +#endif diff --git a/source/backend/arm82/asm/arm64/MNNExpFP16.S b/source/backend/arm82/asm/arm64/MNNExpFP16.S new file mode 100644 index 00000000..c8ca240b --- /dev/null +++ b/source/backend/arm82/asm/arm64/MNNExpFP16.S @@ -0,0 +1,80 @@ +// +// MNNExpFP16.S +// MNN +// +// Created by MNN on 2019/01/18. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" +.text +.align 5 + +//void MNNExpFP16(FLOAT16* dest, const FLOAT16* source, const FLOAT16* parameters, size_t block) +asm_function MNNExpFP16 + +//x0: dest, x1:source, x2:parameters, x3:block + +ld1 {v0.8h}, [x2] +movi v2.8h, #10 +movi v3.8h, #11 +scvtf v3.8h, v3.8h +fneg v4.8h, v3.8h + +Loop: + +ld1 {v16.8h, v17.8h}, [x1], #32 + +fmin v16.8h, v16.8h, v3.8h +fmin v17.8h, v17.8h, v3.8h +fmax v16.8h, v16.8h, v4.8h +fmax v17.8h, v17.8h, v4.8h + +fneg v18.8h, v16.8h +fneg v19.8h, v17.8h + +fmul v16.8h, v18.8h, v0.h[1] +fmul v17.8h, v19.8h, v0.h[1] +fcvtzs v16.8h, v16.8h +fcvtzs v17.8h, v17.8h +scvtf v20.8h, v16.8h +scvtf v21.8h, v17.8h + +//v18.8h, v19.8h: t +fmls v18.8h, v20.8h, v0.h[0] +fmls v19.8h, v21.8h, v0.h[0] + +.macro MLA_TWO z0 z1 z2 z3 +dup \z1, \z0 +fmla \z1, \z2, \z3 +.endm + +MLA_TWO v0.h[6], v20.8h, v18.8h, v0.h[7] +MLA_TWO v0.h[6], v21.8h, v19.8h, v0.h[7] +MLA_TWO v0.h[5], v22.8h, v18.8h, v20.8h +MLA_TWO v0.h[5], v23.8h, v19.8h, v21.8h +MLA_TWO v0.h[4], v20.8h, v18.8h, v22.8h +MLA_TWO v0.h[4], v21.8h, v19.8h, v23.8h +MLA_TWO v0.h[3], v22.8h, v18.8h, v20.8h +MLA_TWO v0.h[3], v23.8h, v19.8h, v21.8h +MLA_TWO v0.h[2], v20.8h, v18.8h, v22.8h +MLA_TWO v0.h[2], v21.8h, v19.8h, v23.8h + +//v20.8h, v21.8h is expRemain + +ushl v16.8h, v16.8h, v2.8h +ushl v17.8h, v17.8h, v2.8h +add v20.8h, v20.8h, v16.8h +add v21.8h, v21.8h, v17.8h + +st1 {v20.8h, v21.8h}, [x0], #32 + +subs x3, x3, #1 +bne Loop + +ret + +#endif + diff --git a/source/backend/arm82/asm/arm64/MNNGemmFP16C8_UNIT.S b/source/backend/arm82/asm/arm64/MNNGemmFP16C8_UNIT.S deleted file mode 100644 index 5ffcfb1e..00000000 --- a/source/backend/arm82/asm/arm64/MNNGemmFP16C8_UNIT.S +++ /dev/null @@ -1,437 +0,0 @@ -// -// MNNGemmFP16C8_UNIT.S -// MNN -// -// Created by MNN on 2020/01/14. 
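Editor's note, not part of the patch: the depth-wise line kernel added above is a plain per-pixel, per-tap multiply-accumulate over a channel block of 8; the 16/8/4/1 unrolling only exists to keep many accumulators live in v16-v31 while one weight vector is reused. A scalar sketch, with strides in FLOAT16 elements (the assembly converts them to bytes up front) and float standing in for FLOAT16:

#include <cstddef>

static void ConvRunForLineDepthwiseRef(float* dst, const float* src, const float* weight,
                                       size_t width, size_t src_w_setup, size_t fw, size_t fh,
                                       size_t dilateX_step, size_t dilateY_step, size_t height,
                                       size_t srcHStep, size_t dstHStep) {
    for (size_t y = 0; y < height; ++y) {
        for (size_t x = 0; x < width; ++x) {
            const float* srcX = src + y * srcHStep + x * src_w_setup;
            float* dstX       = dst + y * dstHStep + x * 8;
            float acc[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
            for (size_t ky = 0; ky < fh; ++ky) {
                for (size_t kx = 0; kx < fw; ++kx) {
                    const float* s = srcX + ky * dilateY_step + kx * dilateX_step;
                    const float* w = weight + (ky * fw + kx) * 8;
                    for (int c = 0; c < 8; ++c) {
                        acc[c] += s[c] * w[c];   // one fmla lane in the assembly
                    }
                }
            }
            for (int c = 0; c < 8; ++c) {
                dstX[c] = acc[c];
            }
        }
    }
}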
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __aarch64__ -#include "MNNAsmGlobal.h" - -.text -.align 5 -asm_function MNNGemmFP16C8_UNIT -// void MNNGemmFP16C8_UNIT(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight, -// const FLOAT16* bias, size_t src_loop, size_t dst_step, size_t dst_loop, size_t relu, -// size_t relu6, size_t realDstCount) - -// Auto: -// x0:dst, x1:src, x2:weight, x3:bias, x4:src_loop -// x5:dst_step, x6:dst_loop, x7:relu -// load from sp: -// x8:relu6, x9:realDstCount - -ldr x8, [sp, #0] -ldr x9, [sp, #8] - -mov x10, #8 -mul x4, x4, x10 // x4 * 8 = (inputChannelUnit * kernelCount) * 8 - -cmp x9, #4 -ble TILE_4 - -TILE_8: -cmp x6, #2 -blt LoopDz_TILE_8_ONE_OC - -LoopDz_TILE_8_DOUBLE_OC: - ldr q6, [x3], #16 // bias - mov x11, x1 - mov x12, x4 - ldr q7, [x3], #16 // bias + 8 - - mov v16.16b, v6.16b - mov v17.16b, v6.16b - mov v18.16b, v6.16b - mov v19.16b, v6.16b - mov v20.16b, v6.16b - mov v21.16b, v6.16b - mov v22.16b, v6.16b - mov v23.16b, v6.16b - mov v24.16b, v7.16b - mov v25.16b, v7.16b - mov v26.16b, v7.16b - mov v27.16b, v7.16b - mov v28.16b, v7.16b - mov v29.16b, v7.16b - mov v30.16b, v7.16b - mov v31.16b, v7.16b - - LoopSz_TILE_8_DOUBLE_OC: - ldr q0, [x2] // weight - ldr q4, [x11] // input - fmla v16.8h, v0.8h, v4.h[0] - fmla v17.8h, v0.8h, v4.h[1] - fmla v18.8h, v0.8h, v4.h[2] - fmla v19.8h, v0.8h, v4.h[3] - ldr q1, [x2, #16] // weight - fmla v20.8h, v0.8h, v4.h[4] - fmla v21.8h, v0.8h, v4.h[5] - fmla v22.8h, v0.8h, v4.h[6] - fmla v23.8h, v0.8h, v4.h[7] - - ldr q2, [x2, #32] // weight - fmla v24.8h, v1.8h, v4.h[0] - fmla v25.8h, v1.8h, v4.h[1] - fmla v26.8h, v1.8h, v4.h[2] - fmla v27.8h, v1.8h, v4.h[3] - ldr q5, [x11, #16] // input - fmla v28.8h, v1.8h, v4.h[4] - fmla v29.8h, v1.8h, v4.h[5] - fmla v30.8h, v1.8h, v4.h[6] - fmla v31.8h, v1.8h, v4.h[7] - - fmla v16.8h, v2.8h, v5.h[0] - fmla v17.8h, v2.8h, v5.h[1] - ldr q3, [x2, #48] // weight - fmla v18.8h, v2.8h, v5.h[2] - fmla v19.8h, v2.8h, v5.h[3] - add x11, x11, #32 - fmla v20.8h, v2.8h, v5.h[4] - fmla v21.8h, v2.8h, v5.h[5] - fmla v22.8h, v2.8h, v5.h[6] - fmla v23.8h, v2.8h, v5.h[7] - - fmla v24.8h, v3.8h, v5.h[0] - fmla v25.8h, v3.8h, v5.h[1] - subs x12, x12, #2 - fmla v26.8h, v3.8h, v5.h[2] - fmla v27.8h, v3.8h, v5.h[3] - add x2, x2, #64 - fmla v28.8h, v3.8h, v5.h[4] - fmla v29.8h, v3.8h, v5.h[5] - fmla v30.8h, v3.8h, v5.h[6] - fmla v31.8h, v3.8h, v5.h[7] - bne LoopSz_TILE_8_DOUBLE_OC - - cbz x7, RELU6_DOUBLE_OC - eor v0.16b, v0.16b, v0.16b - fmax v16.8h, v16.8h, v0.8h - fmax v17.8h, v17.8h, v0.8h - fmax v18.8h, v18.8h, v0.8h - fmax v19.8h, v19.8h, v0.8h - fmax v20.8h, v20.8h, v0.8h - fmax v21.8h, v21.8h, v0.8h - fmax v22.8h, v22.8h, v0.8h - fmax v23.8h, v23.8h, v0.8h - fmax v24.8h, v24.8h, v0.8h - fmax v25.8h, v25.8h, v0.8h - fmax v26.8h, v26.8h, v0.8h - fmax v27.8h, v27.8h, v0.8h - fmax v28.8h, v28.8h, v0.8h - fmax v29.8h, v29.8h, v0.8h - fmax v30.8h, v30.8h, v0.8h - fmax v31.8h, v31.8h, v0.8h - - RELU6_DOUBLE_OC: - cbz x8, STORE_TILE_8_DOUBLE_OC - eor v0.16b, v0.16b, v0.16b - movi v1.8h, #0x46, lsl #8 // 0x4600 -> fp16(6.0) - fmax v16.8h, v16.8h, v0.8h - fmax v17.8h, v17.8h, v0.8h - fmax v18.8h, v18.8h, v0.8h - fmax v19.8h, v19.8h, v0.8h - fmax v20.8h, v20.8h, v0.8h - fmax v21.8h, v21.8h, v0.8h - fmax v22.8h, v22.8h, v0.8h - fmax v23.8h, v23.8h, v0.8h - fmax v24.8h, v24.8h, v0.8h - fmax v25.8h, v25.8h, v0.8h - fmax v26.8h, v26.8h, v0.8h - fmax v27.8h, v27.8h, v0.8h - fmax v28.8h, v28.8h, v0.8h - fmax v29.8h, v29.8h, v0.8h - fmax v30.8h, v30.8h, v0.8h - fmax v31.8h, v31.8h, 
v0.8h - fmin v16.8h, v16.8h, v1.8h - fmin v17.8h, v17.8h, v1.8h - fmin v18.8h, v18.8h, v1.8h - fmin v19.8h, v19.8h, v1.8h - fmin v20.8h, v20.8h, v1.8h - fmin v21.8h, v21.8h, v1.8h - fmin v22.8h, v22.8h, v1.8h - fmin v23.8h, v23.8h, v1.8h - fmin v24.8h, v24.8h, v1.8h - fmin v25.8h, v25.8h, v1.8h - fmin v26.8h, v26.8h, v1.8h - fmin v27.8h, v27.8h, v1.8h - fmin v28.8h, v28.8h, v1.8h - fmin v29.8h, v29.8h, v1.8h - fmin v30.8h, v30.8h, v1.8h - fmin v31.8h, v31.8h, v1.8h - - STORE_TILE_8_DOUBLE_OC: - str q16, [x0] - str q17, [x0, #16] - str q18, [x0, #32] - str q19, [x0, #48] - str q20, [x0, #64] - str q21, [x0, #80] - str q22, [x0, #96] - str q23, [x0, #112] - add x0, x0, x5 - str q24, [x0] - str q25, [x0, #16] - str q26, [x0, #32] - str q27, [x0, #48] - str q28, [x0, #64] - str q29, [x0, #80] - str q30, [x0, #96] - str q31, [x0, #112] - sub x6, x6, #2 - cmp x6, #2 - add x0, x0, x5 - BGE LoopDz_TILE_8_DOUBLE_OC - - -LoopDz_TILE_8_ONE_OC: -cmp x6, #0 -beq REAL_END - -ldr q6, [x3] // bias -mov x11, x1 -mov x12, x4 - -mov v24.16b, v6.16b -mov v25.16b, v6.16b -mov v26.16b, v6.16b -mov v27.16b, v6.16b -mov v28.16b, v6.16b -mov v29.16b, v6.16b -mov v30.16b, v6.16b -mov v31.16b, v6.16b - -LoopSz_TILE_8_ONE_OC: - ldr q0, [x2] // weight - ldr q4, [x11] // input - fmla v24.8h, v0.8h, v4.h[0] - fmla v25.8h, v0.8h, v4.h[1] - ldr q2, [x2, #16] // weight - fmla v26.8h, v0.8h, v4.h[2] - fmla v27.8h, v0.8h, v4.h[3] - ldr q5, [x11, #16] // input - fmla v28.8h, v0.8h, v4.h[4] - fmla v29.8h, v0.8h, v4.h[5] - fmla v30.8h, v0.8h, v4.h[6] - fmla v31.8h, v0.8h, v4.h[7] - - fmla v24.8h, v2.8h, v5.h[0] - fmla v25.8h, v2.8h, v5.h[1] - subs x12, x12, #2 - fmla v26.8h, v2.8h, v5.h[2] - fmla v27.8h, v2.8h, v5.h[3] - add x2, x2, #32 - fmla v28.8h, v2.8h, v5.h[4] - fmla v29.8h, v2.8h, v5.h[5] - add x11, x11, #32 - fmla v30.8h, v2.8h, v5.h[6] - fmla v31.8h, v2.8h, v5.h[7] - bne LoopSz_TILE_8_ONE_OC - -cbz x7, RELU6_ONE_OC -eor v0.16b, v0.16b, v0.16b -fmax v24.8h, v24.8h, v0.8h -fmax v25.8h, v25.8h, v0.8h -fmax v26.8h, v26.8h, v0.8h -fmax v27.8h, v27.8h, v0.8h -fmax v28.8h, v28.8h, v0.8h -fmax v29.8h, v29.8h, v0.8h -fmax v30.8h, v30.8h, v0.8h -fmax v31.8h, v31.8h, v0.8h - -RELU6_ONE_OC: -cbz x8, STORE_TILE_8_ONE_OC -eor v0.16b, v0.16b, v0.16b -movi v1.8h, #0x46, lsl #8 // 0x4600 -> fp16(6.0) -fmax v24.8h, v24.8h, v0.8h -fmax v25.8h, v25.8h, v0.8h -fmax v26.8h, v26.8h, v0.8h -fmax v27.8h, v27.8h, v0.8h -fmax v28.8h, v28.8h, v0.8h -fmax v29.8h, v29.8h, v0.8h -fmax v30.8h, v30.8h, v0.8h -fmax v31.8h, v31.8h, v0.8h - -fmin v24.8h, v24.8h, v1.8h -fmin v25.8h, v25.8h, v1.8h -fmin v26.8h, v26.8h, v1.8h -fmin v27.8h, v27.8h, v1.8h -fmin v28.8h, v28.8h, v1.8h -fmin v29.8h, v29.8h, v1.8h -fmin v30.8h, v30.8h, v1.8h -fmin v31.8h, v31.8h, v1.8h - -STORE_TILE_8_ONE_OC: -str q24, [x0] -str q25, [x0, #16] -str q26, [x0, #32] -str q27, [x0, #48] -str q28, [x0, #64] -str q29, [x0, #80] -str q30, [x0, #96] -str q31, [x0, #112] - -b REAL_END - -# remain tile is (0, 4] -TILE_4: -cmp x6, #2 -blt LoopDz_TILE_4_ONE_OC - -LoopDz_TILE_4_DOUBLE_OC: - ldr q6, [x3], #16 // bias - mov x11, x1 - mov x12, x4 - ldr q7, [x3], #16 // bias + 8 - - mov v24.16b, v6.16b - mov v25.16b, v6.16b - mov v26.16b, v6.16b - mov v27.16b, v6.16b - - mov v28.16b, v7.16b - mov v29.16b, v7.16b - mov v30.16b, v7.16b - mov v31.16b, v7.16b - - LoopSz_TILE_4_DOUBLE_OC: - ldr q0, [x2] // weight - ldr d4, [x11] // input - fmla v24.8h, v0.8h, v4.h[0] - fmla v25.8h, v0.8h, v4.h[1] - ldr q1, [x2, #16] // weight - fmla v26.8h, v0.8h, v4.h[2] - fmla v27.8h, v0.8h, v4.h[3] - ldr d5, [x11, 
#8] // input - fmla v28.8h, v1.8h, v4.h[0] - fmla v29.8h, v1.8h, v4.h[1] - ldr q2, [x2, #32] // weight - fmla v30.8h, v1.8h, v4.h[2] - fmla v31.8h, v1.8h, v4.h[3] - - ldr q3, [x2, #48] // weight - fmla v24.8h, v2.8h, v5.h[0] - fmla v25.8h, v2.8h, v5.h[1] - subs x12, x12, #2 - fmla v26.8h, v2.8h, v5.h[2] - fmla v27.8h, v2.8h, v5.h[3] - add x2, x2, #64 - fmla v28.8h, v3.8h, v5.h[0] - fmla v29.8h, v3.8h, v5.h[1] - add x11, x11, #16 - fmla v30.8h, v3.8h, v5.h[2] - fmla v31.8h, v3.8h, v5.h[3] - bne LoopSz_TILE_4_DOUBLE_OC - - cbz x7, RELU6_TILE_4_DOUBLE_OC - eor v0.16b, v0.16b, v0.16b - fmax v24.8h, v24.8h, v0.8h - fmax v25.8h, v25.8h, v0.8h - fmax v26.8h, v26.8h, v0.8h - fmax v27.8h, v27.8h, v0.8h - fmax v28.8h, v28.8h, v0.8h - fmax v29.8h, v29.8h, v0.8h - fmax v30.8h, v30.8h, v0.8h - fmax v31.8h, v31.8h, v0.8h - - RELU6_TILE_4_DOUBLE_OC: - cbz x8, STORE_TILE_4_DOUBLE_OC - eor v0.16b, v0.16b, v0.16b - movi v1.8h, #0x46, lsl #8 // 0x4600 -> fp16(6.0) - fmax v24.8h, v24.8h, v0.8h - fmax v25.8h, v25.8h, v0.8h - fmax v26.8h, v26.8h, v0.8h - fmax v27.8h, v27.8h, v0.8h - fmax v28.8h, v28.8h, v0.8h - fmax v29.8h, v29.8h, v0.8h - fmax v30.8h, v30.8h, v0.8h - fmax v31.8h, v31.8h, v0.8h - fmin v24.8h, v24.8h, v1.8h - fmin v25.8h, v25.8h, v1.8h - fmin v26.8h, v26.8h, v1.8h - fmin v27.8h, v27.8h, v1.8h - fmin v28.8h, v28.8h, v1.8h - fmin v29.8h, v29.8h, v1.8h - fmin v30.8h, v30.8h, v1.8h - fmin v31.8h, v31.8h, v1.8h - - STORE_TILE_4_DOUBLE_OC: - str q24, [x0] - str q25, [x0, #16] - str q26, [x0, #32] - str q27, [x0, #48] - add x0, x0, x5 - sub x6, x6, #2 - str q28, [x0] - str q29, [x0, #16] - str q30, [x0, #32] - str q31, [x0, #48] - cmp x6, #2 - add x0, x0, x5 - BGE LoopDz_TILE_4_DOUBLE_OC - - -LoopDz_TILE_4_ONE_OC: -cmp x6, #0 -beq REAL_END - -ldr q6, [x3] // bias -mov x11, x1 -mov x12, x4 - -mov v28.16b, v6.16b -mov v29.16b, v6.16b -mov v30.16b, v6.16b -mov v31.16b, v6.16b - -LoopSz_TILE_4_ONE_OC: - ldr q0, [x2] // weight - ldr d4, [x11] // input - ldr q2, [x2, #16] // weight - ldr d5, [x11, #8] // input - fmla v28.8h, v0.8h, v4.h[0] - fmla v29.8h, v0.8h, v4.h[1] - subs x12, x12, #2 - fmla v30.8h, v0.8h, v4.h[2] - fmla v31.8h, v0.8h, v4.h[3] - add x2, x2, #32 - fmla v28.8h, v2.8h, v5.h[0] - fmla v29.8h, v2.8h, v5.h[1] - add x11, x11, #16 - fmla v30.8h, v2.8h, v5.h[2] - fmla v31.8h, v2.8h, v5.h[3] - bne LoopSz_TILE_4_ONE_OC - -cbz x7, RELU6_TILE_4_ONE_OC -eor v0.16b, v0.16b, v0.16b -fmax v28.8h, v28.8h, v0.8h -fmax v29.8h, v29.8h, v0.8h -fmax v30.8h, v30.8h, v0.8h -fmax v31.8h, v31.8h, v0.8h - -RELU6_TILE_4_ONE_OC: -cbz x8, STORE_TILE_4_ONE_OC -eor v0.16b, v0.16b, v0.16b -movi v1.8h, #0x46, lsl #8 // 0x4600 -> fp16(6.0) -fmax v28.8h, v28.8h, v0.8h -fmax v29.8h, v29.8h, v0.8h -fmax v30.8h, v30.8h, v0.8h -fmax v31.8h, v31.8h, v0.8h - -fmin v28.8h, v28.8h, v1.8h -fmin v29.8h, v29.8h, v1.8h -fmin v30.8h, v30.8h, v1.8h -fmin v31.8h, v31.8h, v1.8h - -STORE_TILE_4_ONE_OC: -str q28, [x0] -str q29, [x0, #16] -str q30, [x0, #32] -str q31, [x0, #48] - -REAL_END: - -ret - -#endif \ No newline at end of file diff --git a/source/backend/arm82/asm/arm64/MNNLineDepthWiseFp16C8Unit.S b/source/backend/arm82/asm/arm64/MNNLineDepthWiseFp16C8Unit.S deleted file mode 100644 index cd188dc8..00000000 --- a/source/backend/arm82/asm/arm64/MNNLineDepthWiseFp16C8Unit.S +++ /dev/null @@ -1,253 +0,0 @@ -// -// MNNLineDepthWiseFp16C8Unit.S -// MNN -// -// Created by MNN on 2019/01/14. 
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNLineDepthWiseFp16C8Unit -// void MNNLineDepthWiseFp16C8Unit(FLOAT16* dst, const FLOAT16* src, -// const FLOAT16* weight, const FLOAT16* bias_z, size_t width, size_t src_w_step, -// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t relu, size_t relu6) - -// Auto: -// x0: dst, x1:src, x2:weight, x3:bias_z, x4:width -// x5:src_w_step, x6:fw, x7:fh -// Load: -// x8:dilateX_step, x9:dilateY_step, x10:relu, x11:relu6 - -ldr x8, [sp, #0] -ldr x9, [sp, #8] -ldr x10, [sp, #16] -ldr x11, [sp, #24] - -mov x12, #2 // sizeof(fp16) == 2 -mul x5, x12, x5 -mul x8, x12, x8 -mul x9, x12, x9 - -// fw * dilateX_step -mul x12, x6, x8 -sub x9, x9, x12 -ldr q0, [x3] // bias - -L8: -cmp x4, #7 -ble L4 - -mov x12, #8 -mul x12, x5, x12 - -LOOP_TILE_8: - mov v16.16b, v0.16b - mov v17.16b, v0.16b - mov v18.16b, v0.16b - mov v19.16b, v0.16b - mov v20.16b, v0.16b - mov v21.16b, v0.16b - mov v22.16b, v0.16b - mov v23.16b, v0.16b - // x7 -> kh - mov x13, x7 - // keep x1 - mov x3, x1 - // keep x2 - mov x15, x2 - LOOP_TILE_8_KH: - // x6 -> kw - mov x14, x6 - LOOP_TILE_8_KW: - ldr q1, [x2], #16 // weight - ld1 {v24.16b}, [x1], x5 // input - ld1 {v25.16b}, [x1], x5 // input - ld1 {v26.16b}, [x1], x5 - ld1 {v27.16b}, [x1], x5 - fmla v16.8h, v1.8h, v24.8h - fmla v17.8h, v1.8h, v25.8h - subs x14, x14, #1 - fmla v18.8h, v1.8h, v26.8h - fmla v19.8h, v1.8h, v27.8h - ld1 {v28.16b}, [x1], x5 - ld1 {v29.16b}, [x1], x5 - ld1 {v30.16b}, [x1], x5 - ld1 {v31.16b}, [x1], x5 - fmla v20.8h, v1.8h, v28.8h - fmla v21.8h, v1.8h, v29.8h - sub x1, x1, x12 - fmla v22.8h, v1.8h, v30.8h - fmla v23.8h, v1.8h, v31.8h - add x1, x1, x8 - bne LOOP_TILE_8_KW - subs x13, x13, #1 - add x1, x1, x9 - bne LOOP_TILE_8_KH - - sub x4, x4, #8 - cbz x10, LOOP_TILE_8_RELU6 - eor v6.16b, v6.16b, v6.16b - fmax v16.8h, v16.8h, v6.8h - fmax v17.8h, v17.8h, v6.8h - fmax v18.8h, v18.8h, v6.8h - fmax v19.8h, v19.8h, v6.8h - fmax v20.8h, v20.8h, v6.8h - fmax v21.8h, v21.8h, v6.8h - fmax v22.8h, v22.8h, v6.8h - fmax v23.8h, v23.8h, v6.8h - - LOOP_TILE_8_RELU6: - cbz x11, STORE_TILE_8 - eor v6.16b, v6.16b, v6.16b - movi v7.8h, #0x46, lsl #8 // 0x4600 -> fp16(6.0) - fmax v16.8h, v16.8h, v6.8h - fmax v17.8h, v17.8h, v6.8h - fmax v18.8h, v18.8h, v6.8h - fmax v19.8h, v19.8h, v6.8h - fmax v20.8h, v20.8h, v6.8h - fmax v21.8h, v21.8h, v6.8h - fmax v22.8h, v22.8h, v6.8h - fmax v23.8h, v23.8h, v6.8h - - fmin v16.8h, v16.8h, v7.8h - fmin v17.8h, v17.8h, v7.8h - fmin v18.8h, v18.8h, v7.8h - fmin v19.8h, v19.8h, v7.8h - fmin v20.8h, v20.8h, v7.8h - fmin v21.8h, v21.8h, v7.8h - fmin v22.8h, v22.8h, v7.8h - fmin v23.8h, v23.8h, v7.8h - - STORE_TILE_8: - mov x2, x15 - str q16, [x0], #16 - str q17, [x0], #16 - str q18, [x0], #16 - str q19, [x0], #16 - add x1, x12, x3 - cmp x4, #8 - str q20, [x0], #16 - str q21, [x0], #16 - str q22, [x0], #16 - str q23, [x0], #16 - bge LOOP_TILE_8 - -L4: -cmp x4, #3 -ble L1 - -mov x12, #4 -mul x12, x5, x12 - -LOOP_TILE_4: - mov v16.16b, v0.16b - mov v17.16b, v0.16b - mov v18.16b, v0.16b - mov v19.16b, v0.16b - // x7 -> kh - mov x13, x7 - mov x3, x1 - mov x15, x2 - LOOP_TILE_4_KH: - // x6 -> kw - mov x14, x6 - LOOP_TILE_4_KW: - ldr q1, [x2], #16 // weight - ld1 {v24.16b}, [x1], x5 // input - ld1 {v25.16b}, [x1], x5 // input - ld1 {v26.16b}, [x1], x5 - ld1 {v27.16b}, [x1], x5 - fmla v16.8h, v1.8h, v24.8h - fmla v17.8h, v1.8h, v25.8h - subs x14, x14, #1 - fmla v18.8h, v1.8h, v26.8h - fmla v19.8h, 
v1.8h, v27.8h - sub x1, x1, x12 - add x1, x1, x8 - bne LOOP_TILE_4_KW - subs x13, x13, #1 - add x1, x1, x9 - bne LOOP_TILE_4_KH - - sub x4, x4, #4 - cbz x10, LOOP_TILE_4_RELU6 - eor v6.16b, v6.16b, v6.16b - fmax v16.8h, v16.8h, v6.8h - fmax v17.8h, v17.8h, v6.8h - fmax v18.8h, v18.8h, v6.8h - fmax v19.8h, v19.8h, v6.8h - - LOOP_TILE_4_RELU6: - cbz x11, STORE_TILE_4 - eor v6.16b, v6.16b, v6.16b - movi v7.8h, #0x46, lsl #8 // 0x4600 -> fp16(6.0) - fmax v16.8h, v16.8h, v6.8h - fmax v17.8h, v17.8h, v6.8h - fmax v18.8h, v18.8h, v6.8h - fmax v19.8h, v19.8h, v6.8h - - fmin v16.8h, v16.8h, v7.8h - fmin v17.8h, v17.8h, v7.8h - fmin v18.8h, v18.8h, v7.8h - fmin v19.8h, v19.8h, v7.8h - - STORE_TILE_4: - mov x2, x15 - str q16, [x0], #16 - str q17, [x0], #16 - str q18, [x0], #16 - str q19, [x0], #16 - add x1, x12, x3 - cmp x4, #4 - bge LOOP_TILE_4 - -L1: -cmp x4, #0 -beq REAL_END - -LOOP_TILE_1: - mov v16.16b, v0.16b - // x7 -> kh - mov x13, x7 - mov x3, x1 - mov x15, x2 - LOOP_TILE_1_KH: - // x6 -> kw - mov x14, x6 - LOOP_TILE_1_KW: - ld1 {v1.16b}, [x2], #16 // weight - ld1 {v24.16b}, [x1], x8 // input - fmla v16.8h, v1.8h, v24.8h - subs x14, x14, #1 - bne LOOP_TILE_1_KW - subs x13, x13, #1 - add x1, x1, x9 - bne LOOP_TILE_1_KH - - cbz x10, LOOP_TILE_1_RELU6 - eor v6.16b, v6.16b, v6.16b - fmax v16.8h, v16.8h, v6.8h - - LOOP_TILE_1_RELU6: - cbz x11, STORE_TILE_1 - eor v6.16b, v6.16b, v6.16b - movi v7.8h, #0x46, lsl #8 // 0x4600 -> fp16(6.0) - fmax v16.8h, v16.8h, v6.8h - fmin v16.8h, v16.8h, v7.8h - - STORE_TILE_1: - subs x4, x4, #1 - mov x2, x15 - str q16, [x0], #16 - add x1, x5, x3 - bne LOOP_TILE_1 - -REAL_END: -ret -#endif diff --git a/source/backend/arm82/asm/arm64/MNNPackC8FP16.S b/source/backend/arm82/asm/arm64/MNNPackC8FP16.S new file mode 100644 index 00000000..b6b30b42 --- /dev/null +++ b/source/backend/arm82/asm/arm64/MNNPackC8FP16.S @@ -0,0 +1,92 @@ +// +// MNNPackC8FP16.S +// MNN +// +// Created by MNN on 2020/6/30. +// Copyright © 2020 Alibaba. All rights reserved. 
+// +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +asm_function MNNPackC8FP16 +//void MNNPackC8FP16(FLOAT16* dest, const FLOAT16* source, size_t area, size_t depth); +// depth, area -> depthC8, area, 8 +// Auto: x0:dest, x1:source, x2: area, x3: depth +// x4: areaC8, x5:depthC8, x6: sourceStride, x7: destStride + +lsr x4, x2, #3 +lsr x5, x3, #3 +mov x12, #2 // sizeof(FLOAT16) +mov x13, #16 // 8 * sizeof(FLOAT16) +mul x6, x12, x2 +mul x7, x13, x2 +mov x12, #32 +mul x15, x12, x2 + +// [x0, x1, x2, x3] => [x0, x6, x2, x3] =mov=> [x0, x1, x2, x3] +.macro transpose_4x4 x0, x1, x2, x3, x5, x6 +// x0: [00,01,02,03] \ x5:[00,10,02,12] \ x0:[00,10,20,30] +// x1: [10,11,12,13] ===\ x1:[01,11,03,13] ===\ x6:[01,11,21,31] +// x2: [20,21,22,23] ===/ x6:[20,30,22,32] ===/ x2:[02,12,22,32] +// x3: [30,31,32,33] / x3:[21,31,23,33] / x3:[03,13,23,33] + trn1 \x5\().4s, \x0\().4s, \x1\().4s + trn2 \x1\().4s, \x0\().4s, \x1\().4s + trn1 \x6\().4s, \x2\().4s, \x3\().4s + trn2 \x3\().4s, \x2\().4s, \x3\().4s + trn1 \x0\().2d, \x5\().2d, \x6\().2d + trn2 \x2\().2d, \x5\().2d, \x6\().2d + trn1 \x6\().2d, \x1\().2d, \x3\().2d + trn2 \x3\().2d, \x1\().2d, \x3\().2d + mov \x1\().16b, \x6\().16b +.endm + +LoopH: +mov x8, x0 +mov x9, x1 +mov x12, x4 + +LoopL: +mov x10, x9 +ld1 {v16.4s, v17.4s}, [x9], x6 +ld1 {v18.4s, v19.4s}, [x9], x6 +ld1 {v20.4s, v21.4s}, [x9], x6 +ld1 {v22.4s, v23.4s}, [x9], x6 + +ld1 {v24.4s, v25.4s}, [x9], x6 +ld1 {v26.4s, v27.4s}, [x9], x6 +ld1 {v28.4s, v29.4s}, [x9], x6 +ld1 {v30.4s, v31.4s}, [x9], x6 + +transpose_4x4 v16, v18, v20, v22, v0, v1 +transpose_4x4 v17, v19, v21, v23, v2, v3 +transpose_4x4 v24, v26, v28, v30, v4, v5 +transpose_4x4 v25, v27, v29, v31, v6, v7 + +stp q16, q24, [x8], #32 +stp q18, q26, [x8], #32 +stp q20, q28, [x8], #32 +stp q22, q30, [x8], #32 + +stp q17, q25, [x8], #32 +stp q19, q27, [x8], #32 +stp q21, q29, [x8], #32 +stp q23, q31, [x8], #32 + +add x9, x10, #32 + +subs x12, x12, #1 +bne LoopL + + +subs x5, x5, #1 +add x0, x0, x7 +add x1, x1, x15 +bne LoopH + + +ret + +#endif diff --git a/source/backend/arm82/asm/arm64/MNNPackedMatMulFP16.S b/source/backend/arm82/asm/arm64/MNNPackedMatMulFP16.S new file mode 100644 index 00000000..8ce305d3 --- /dev/null +++ b/source/backend/arm82/asm/arm64/MNNPackedMatMulFP16.S @@ -0,0 +1,397 @@ +// +// MNNPackedMatMulFP16.S +// MNN +// +// Created by MNN on 2020/06/10. 
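Editor's note, not part of the patch: MNNPackC8FP16 above is purely a layout change from [depth][area] to [depth/8][area][8]. A scalar sketch of that reordering, assuming (as the lsr #3 loop bounds suggest) that only full 8x8 blocks are walked here and any remainder is handled by the caller; float stands in for FLOAT16:

#include <cstddef>

static void PackC8Ref(float* dest, const float* source, size_t area, size_t depth) {
    size_t areaC8  = area / 8;   // the assembly only walks full 8-wide area tiles
    size_t depthC8 = depth / 8;  // and full 8-channel depth blocks
    for (size_t dz = 0; dz < depthC8; ++dz) {
        for (size_t a = 0; a < areaC8 * 8; ++a) {
            for (size_t c = 0; c < 8; ++c) {
                // dest layout: [depthC8][area][8]; source layout: [depth][area]
                dest[(dz * area + a) * 8 + c] = source[(dz * 8 + c) * area + a];
            }
        }
    }
}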
+// Copyright © 2018, Alibaba Group Holding Limited +// +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +// 8 * 24 MatMul +asm_function MNNPackedMatMulFP16 +//void MNNPackedMatMulFP16(FLOAT16* C, const FLOAT16* A, const FLOAT16* B, const size_t* parameter, const FLOAT16* postParameters, const FLOAT16* bias); +// x0: C, x1:A, x2:B, x3:parameter, x5: postParameters, x6:bias +sub sp, sp, #128 +st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 +st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 + +//ldr x8, [x3, #0] // deprecated +ldr x9, [x3, #8] // l +ldr x10, [x3, #16] // h + +ldr x13, [x3, #24] // cStride +ldr x7, [x3, #40] // bExtraStride + +// v0, v1, v2: A +// v3, v4: B +// v8 - v31: C +add x10, x10, #7 +lsr x10, x10, #3 + +cbz x4, Start +ld1 {v5.8h}, [x4] +fcvtn v5.4h, v5.4s +dup v6.8h, v5.h[2] // Min Value +dup v7.8h, v5.h[3] // Max Value + +Start: + +cmp x10, #2 +blt LH4 + +LH8: +sub x14, x13, #128 +LoopH: + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.8h, v4.8h}, [x2], #32 + ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 + + fmul v8.8h, v3.8h, v0.h[0] + fmul v9.8h, v3.8h, v0.h[1] + fmul v10.8h, v3.8h, v0.h[2] + fmul v11.8h, v3.8h, v0.h[3] + fmul v12.8h, v3.8h, v1.h[0] + fmul v13.8h, v3.8h, v1.h[1] + fmul v14.8h, v3.8h, v1.h[2] + fmul v15.8h, v3.8h, v1.h[3] + fmul v16.8h, v3.8h, v2.h[0] + fmul v17.8h, v3.8h, v2.h[1] + fmul v18.8h, v3.8h, v2.h[2] + fmul v19.8h, v3.8h, v2.h[3] + + fmul v20.8h, v4.8h, v0.h[0] + fmul v21.8h, v4.8h, v0.h[1] + fmul v22.8h, v4.8h, v0.h[2] + fmul v23.8h, v4.8h, v0.h[3] + + fmul v24.8h, v4.8h, v1.h[0] + fmul v25.8h, v4.8h, v1.h[1] + fmul v26.8h, v4.8h, v1.h[2] + fmul v27.8h, v4.8h, v1.h[3] + + fmul v28.8h, v4.8h, v2.h[0] + fmul v29.8h, v4.8h, v2.h[1] + fmul v30.8h, v4.8h, v2.h[2] + fmul v31.8h, v4.8h, v2.h[3] + + beq LoopLEnd + + cmp x12, #2 + blt L1 + LoopL2: + ld1 {v3.8h, v4.8h}, [x2], #32 + ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 + + fmla v8.8h, v3.8h, v0.h[0] + fmla v9.8h, v3.8h, v0.h[1] + fmla v10.8h, v3.8h, v0.h[2] + fmla v11.8h, v3.8h, v0.h[3] + fmla v12.8h, v3.8h, v1.h[0] + fmla v13.8h, v3.8h, v1.h[1] + fmla v14.8h, v3.8h, v1.h[2] + fmla v15.8h, v3.8h, v1.h[3] + fmla v16.8h, v3.8h, v2.h[0] + fmla v17.8h, v3.8h, v2.h[1] + fmla v18.8h, v3.8h, v2.h[2] + fmla v19.8h, v3.8h, v2.h[3] + + fmla v20.8h, v4.8h, v0.h[0] + fmla v21.8h, v4.8h, v0.h[1] + fmla v22.8h, v4.8h, v0.h[2] + fmla v23.8h, v4.8h, v0.h[3] + + fmla v24.8h, v4.8h, v1.h[0] + fmla v25.8h, v4.8h, v1.h[1] + fmla v26.8h, v4.8h, v1.h[2] + fmla v27.8h, v4.8h, v1.h[3] + + fmla v28.8h, v4.8h, v2.h[0] + fmla v29.8h, v4.8h, v2.h[1] + fmla v30.8h, v4.8h, v2.h[2] + fmla v31.8h, v4.8h, v2.h[3] + + ld1 {v3.8h, v4.8h}, [x2], #32 + ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 + + fmla v8.8h, v3.8h, v0.h[0] + fmla v9.8h, v3.8h, v0.h[1] + fmla v10.8h, v3.8h, v0.h[2] + fmla v11.8h, v3.8h, v0.h[3] + fmla v12.8h, v3.8h, v1.h[0] + fmla v13.8h, v3.8h, v1.h[1] + fmla v14.8h, v3.8h, v1.h[2] + fmla v15.8h, v3.8h, v1.h[3] + fmla v16.8h, v3.8h, v2.h[0] + fmla v17.8h, v3.8h, v2.h[1] + fmla v18.8h, v3.8h, v2.h[2] + fmla v19.8h, v3.8h, v2.h[3] + + fmla v20.8h, v4.8h, v0.h[0] + fmla v21.8h, v4.8h, v0.h[1] + fmla v22.8h, v4.8h, v0.h[2] + fmla v23.8h, v4.8h, v0.h[3] + + fmla v24.8h, v4.8h, v1.h[0] + fmla v25.8h, v4.8h, v1.h[1] + fmla v26.8h, v4.8h, v1.h[2] + fmla v27.8h, v4.8h, v1.h[3] + + fmla v28.8h, v4.8h, v2.h[0] + fmla v29.8h, v4.8h, v2.h[1] + fmla v30.8h, v4.8h, v2.h[2] + fmla v31.8h, v4.8h, v2.h[3] + sub x12, x12, #2 + cmp x12, #2 + bge LoopL2 + + cbz x12, LoopLEnd + + L1: + ld1 {v3.8h, v4.8h}, [x2], #32 + ld1 {v0.4h, v1.4h, 
v2.4h}, [x15], #24 + + fmla v8.8h, v3.8h, v0.h[0] + fmla v9.8h, v3.8h, v0.h[1] + fmla v10.8h, v3.8h, v0.h[2] + fmla v11.8h, v3.8h, v0.h[3] + fmla v12.8h, v3.8h, v1.h[0] + fmla v13.8h, v3.8h, v1.h[1] + fmla v14.8h, v3.8h, v1.h[2] + fmla v15.8h, v3.8h, v1.h[3] + fmla v16.8h, v3.8h, v2.h[0] + fmla v17.8h, v3.8h, v2.h[1] + fmla v18.8h, v3.8h, v2.h[2] + fmla v19.8h, v3.8h, v2.h[3] + + fmla v20.8h, v4.8h, v0.h[0] + fmla v21.8h, v4.8h, v0.h[1] + fmla v22.8h, v4.8h, v0.h[2] + fmla v23.8h, v4.8h, v0.h[3] + + fmla v24.8h, v4.8h, v1.h[0] + fmla v25.8h, v4.8h, v1.h[1] + fmla v26.8h, v4.8h, v1.h[2] + fmla v27.8h, v4.8h, v1.h[3] + + fmla v28.8h, v4.8h, v2.h[0] + fmla v29.8h, v4.8h, v2.h[1] + fmla v30.8h, v4.8h, v2.h[2] + fmla v31.8h, v4.8h, v2.h[3] + + LoopLEnd: + + add x2, x2, x7 + sub x10, x10, #2 + cmp x10, #2 + + cbz x4, StoreLH8 + + AddBiasLH8: + ld1 {v0.8h, v1.8h}, [x5], #32 + + fmla v8.8h, v0.8h, v5.h[1] + fmla v9.8h, v0.8h, v5.h[1] + fmla v10.8h, v0.8h, v5.h[1] + fmla v11.8h, v0.8h, v5.h[1] + + fmla v12.8h, v0.8h, v5.h[1] + fmla v13.8h, v0.8h, v5.h[1] + fmla v14.8h, v0.8h, v5.h[1] + fmla v15.8h, v0.8h, v5.h[1] + + fmla v16.8h, v0.8h, v5.h[1] + fmla v17.8h, v0.8h, v5.h[1] + fmla v18.8h, v0.8h, v5.h[1] + fmla v19.8h, v0.8h, v5.h[1] + + fmla v20.8h, v1.8h, v5.h[1] + fmla v21.8h, v1.8h, v5.h[1] + fmla v22.8h, v1.8h, v5.h[1] + fmla v23.8h, v1.8h, v5.h[1] + + fmla v24.8h, v1.8h, v5.h[1] + fmla v25.8h, v1.8h, v5.h[1] + fmla v26.8h, v1.8h, v5.h[1] + fmla v27.8h, v1.8h, v5.h[1] + + fmla v28.8h, v1.8h, v5.h[1] + fmla v29.8h, v1.8h, v5.h[1] + fmla v30.8h, v1.8h, v5.h[1] + fmla v31.8h, v1.8h, v5.h[1] + + PostTreatLH8: + fmax v8.8h, v8.8h, v6.8h + fmax v9.8h, v9.8h, v6.8h + fmax v10.8h, v10.8h, v6.8h + fmax v11.8h, v11.8h, v6.8h + fmax v12.8h, v12.8h, v6.8h + fmax v13.8h, v13.8h, v6.8h + fmax v14.8h, v14.8h, v6.8h + fmax v15.8h, v15.8h, v6.8h + fmax v16.8h, v16.8h, v6.8h + fmax v17.8h, v17.8h, v6.8h + fmax v18.8h, v18.8h, v6.8h + fmax v19.8h, v19.8h, v6.8h + fmax v20.8h, v20.8h, v6.8h + fmax v21.8h, v21.8h, v6.8h + fmax v22.8h, v22.8h, v6.8h + fmax v23.8h, v23.8h, v6.8h + fmax v24.8h, v24.8h, v6.8h + fmax v25.8h, v25.8h, v6.8h + fmax v26.8h, v26.8h, v6.8h + fmax v27.8h, v27.8h, v6.8h + fmax v28.8h, v28.8h, v6.8h + fmax v29.8h, v29.8h, v6.8h + fmax v30.8h, v30.8h, v6.8h + fmax v31.8h, v31.8h, v6.8h + + fmin v8.8h, v8.8h, v7.8h + fmin v9.8h, v9.8h, v7.8h + fmin v10.8h, v10.8h, v7.8h + fmin v11.8h, v11.8h, v7.8h + fmin v12.8h, v12.8h, v7.8h + fmin v13.8h, v13.8h, v7.8h + fmin v14.8h, v14.8h, v7.8h + fmin v15.8h, v15.8h, v7.8h + fmin v16.8h, v16.8h, v7.8h + fmin v17.8h, v17.8h, v7.8h + fmin v18.8h, v18.8h, v7.8h + fmin v19.8h, v19.8h, v7.8h + fmin v20.8h, v20.8h, v7.8h + fmin v21.8h, v21.8h, v7.8h + fmin v22.8h, v22.8h, v7.8h + fmin v23.8h, v23.8h, v7.8h + fmin v24.8h, v24.8h, v7.8h + fmin v25.8h, v25.8h, v7.8h + fmin v26.8h, v26.8h, v7.8h + fmin v27.8h, v27.8h, v7.8h + fmin v28.8h, v28.8h, v7.8h + fmin v29.8h, v29.8h, v7.8h + fmin v30.8h, v30.8h, v7.8h + fmin v31.8h, v31.8h, v7.8h + + StoreLH8: + + st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #64 + st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x0], #64 + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x14 + + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], #64 + st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], x14 + + bge LoopH + +LH4: +cbz x10, End +LoopHRemain: + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.8h}, [x2] + ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 + + fmul v8.8h, v3.8h, v0.h[0] + fmul v9.8h, v3.8h, v0.h[1] + add x2, x2, #32 + 
fmul v10.8h, v3.8h, v0.h[2] + fmul v11.8h, v3.8h, v0.h[3] + fmul v12.8h, v3.8h, v1.h[0] + fmul v13.8h, v3.8h, v1.h[1] + fmul v14.8h, v3.8h, v1.h[2] + fmul v15.8h, v3.8h, v1.h[3] + fmul v16.8h, v3.8h, v2.h[0] + fmul v17.8h, v3.8h, v2.h[1] + fmul v18.8h, v3.8h, v2.h[2] + fmul v19.8h, v3.8h, v2.h[3] + + beq LoopLREnd + + LoopLR: + ld1 {v3.8h}, [x2] + ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 + + fmla v8.8h, v3.8h, v0.h[0] + fmla v9.8h, v3.8h, v0.h[1] + fmla v10.8h, v3.8h, v0.h[2] + fmla v11.8h, v3.8h, v0.h[3] + add x2, x2, #32 + fmla v12.8h, v3.8h, v1.h[0] + fmla v13.8h, v3.8h, v1.h[1] + fmla v14.8h, v3.8h, v1.h[2] + fmla v15.8h, v3.8h, v1.h[3] + fmla v16.8h, v3.8h, v2.h[0] + fmla v17.8h, v3.8h, v2.h[1] + fmla v18.8h, v3.8h, v2.h[2] + fmla v19.8h, v3.8h, v2.h[3] + + subs x12, x12, #1 + bne LoopLR + LoopLREnd: + + cbz x4, StoreLH4 + AddBiasLH4: + ld1 {v0.8h}, [x5], #16 + + fmla v8.8h, v0.8h, v5.h[1] + fmla v9.8h, v0.8h, v5.h[1] + fmla v10.8h, v0.8h, v5.h[1] + fmla v11.8h, v0.8h, v5.h[1] + + fmla v12.8h, v0.8h, v5.h[1] + fmla v13.8h, v0.8h, v5.h[1] + fmla v14.8h, v0.8h, v5.h[1] + fmla v15.8h, v0.8h, v5.h[1] + + fmla v16.8h, v0.8h, v5.h[1] + fmla v17.8h, v0.8h, v5.h[1] + fmla v18.8h, v0.8h, v5.h[1] + fmla v19.8h, v0.8h, v5.h[1] + + PostTreatLH4: + fmax v8.8h, v8.8h, v6.8h + fmax v9.8h, v9.8h, v6.8h + fmax v10.8h, v10.8h, v6.8h + fmax v11.8h, v11.8h, v6.8h + fmax v12.8h, v12.8h, v6.8h + fmax v13.8h, v13.8h, v6.8h + fmax v14.8h, v14.8h, v6.8h + fmax v15.8h, v15.8h, v6.8h + fmax v16.8h, v16.8h, v6.8h + fmax v17.8h, v17.8h, v6.8h + fmax v18.8h, v18.8h, v6.8h + fmax v19.8h, v19.8h, v6.8h + + fmin v8.8h, v8.8h, v7.8h + fmin v9.8h, v9.8h, v7.8h + fmin v10.8h, v10.8h, v7.8h + fmin v11.8h, v11.8h, v7.8h + fmin v12.8h, v12.8h, v7.8h + fmin v13.8h, v13.8h, v7.8h + fmin v14.8h, v14.8h, v7.8h + fmin v15.8h, v15.8h, v7.8h + fmin v16.8h, v16.8h, v7.8h + fmin v17.8h, v17.8h, v7.8h + fmin v18.8h, v18.8h, v7.8h + fmin v19.8h, v19.8h, v7.8h + + StoreLH4: + + st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #64 + st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x0], #64 + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0] + sub x10, x10, #1 + + +End: +sub sp, sp, #128 +ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 +ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 + +ret + +#endif diff --git a/source/backend/arm82/asm/arm64/MNNPackedMatMulRemainFP16.S b/source/backend/arm82/asm/arm64/MNNPackedMatMulRemainFP16.S new file mode 100644 index 00000000..4cdfaa36 --- /dev/null +++ b/source/backend/arm82/asm/arm64/MNNPackedMatMulRemainFP16.S @@ -0,0 +1,539 @@ +// +// MNNPackedMatMulRemainFP16.S +// MNN +// +// Created by MNN on 2020/06/10. 
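Editor's note, not part of the patch: both the packed GEMM above and the Remain variant that follows share the same optional epilogue. From the loads, postParameters is stored as float32 and narrowed to fp16 (ld1 followed by fcvtn); entry [1] scales the bias and entries [2]/[3] are the clamp bounds broadcast into v6/v7. A scalar sketch of that epilogue, with float in place of FLOAT16 and names illustrative:

#include <algorithm>

// Applied to each accumulator when postParameters is non-null; otherwise the
// raw accumulators are stored (the cbz path in the assembly).
static inline float PostTreatRef(float acc, float bias, const float* postParameters) {
    if (postParameters == nullptr) {
        return acc;
    }
    acc += postParameters[1] * bias;         // fmla with v5.h[1]
    acc = std::max(acc, postParameters[2]);  // fmax with the broadcast min value (v6)
    acc = std::min(acc, postParameters[3]);  // fmin with the broadcast max value (v7)
    return acc;
}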
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +// 8 * 24 MatMul, C(UP_DIV(h,8), e, h8) = B(UP_DIV(h,hP), l, hP) * A(l, e), hP = 24 +// Remain meaning is eSize is any value +asm_function MNNPackedMatMulRemainFP16 +//void MNNPackedMatMulRemainFP16(FLOAT16* C, const FLOAT16* A, const FLOAT16* B, size_t eSize, const size_t* parameter, const FLOAT16* postParameters, const FLOAT16* bias); +//Auto x0: C, x1:A, x2:B, x3:eSize, x4:parameter, x6:postParameters, x7:bias +// parameter: {aStride, l, h, cStride, bExtraStride} +sub sp, sp, #32 +str x19, [sp, #0] +str x20, [sp, #8] +str x21, [sp, #16] +add sp, sp, #32 +ldr x11, [x4, #0] // aStride +ldr x9, [x4, #8] // l +ldr x10, [x4, #16] // h + +ldr x7, [x4, #24] // cStride +ldr x19, [x4, #40] // bExtraStride + +add x10, x10, #7 +lsr x10, x10, #3 + +cbz x5, Start +ld1 {v5.4s}, [x5] +fcvtn v5.4h, v5.4s +dup v6.8h, v5.h[2] // Min Value +dup v7.8h, v5.h[3] // Max Value + +Start: + +E8: +cmp x3, #8 +blt E4 + +// 8x16 +LoopE8: + mov x20, x6 + mov x8, x10 + mov x21, x0 + mov x13, x2 + + LH8: + cmp x8, #2 + blt LH4 + sub x14, x7, #64 + LoopH8x8: + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.8h, v4.8h}, [x13], #32 + ld1 {v0.8h}, [x15], x11 + fmul v16.8h, v3.8h, v0.h[0] + fmul v17.8h, v3.8h, v0.h[1] + fmul v18.8h, v3.8h, v0.h[2] + fmul v19.8h, v3.8h, v0.h[3] + + fmul v20.8h, v4.8h, v0.h[0] + fmul v21.8h, v4.8h, v0.h[1] + fmul v22.8h, v4.8h, v0.h[2] + fmul v23.8h, v4.8h, v0.h[3] + + fmul v24.8h, v3.8h, v0.h[4] + fmul v25.8h, v3.8h, v0.h[5] + fmul v26.8h, v3.8h, v0.h[6] + fmul v27.8h, v3.8h, v0.h[7] + + fmul v28.8h, v4.8h, v0.h[4] + fmul v29.8h, v4.8h, v0.h[5] + fmul v30.8h, v4.8h, v0.h[6] + fmul v31.8h, v4.8h, v0.h[7] + beq LoopLEnd + + LoopL: + ld1 {v3.8h, v4.8h}, [x13], #32 + ld1 {v0.8h}, [x15], x11 + fmla v16.8h, v3.8h, v0.h[0] + fmla v17.8h, v3.8h, v0.h[1] + fmla v18.8h, v3.8h, v0.h[2] + fmla v19.8h, v3.8h, v0.h[3] + + fmla v20.8h, v4.8h, v0.h[0] + fmla v21.8h, v4.8h, v0.h[1] + fmla v22.8h, v4.8h, v0.h[2] + fmla v23.8h, v4.8h, v0.h[3] + + fmla v24.8h, v3.8h, v0.h[4] + fmla v25.8h, v3.8h, v0.h[5] + fmla v26.8h, v3.8h, v0.h[6] + fmla v27.8h, v3.8h, v0.h[7] + + fmla v28.8h, v4.8h, v0.h[4] + fmla v29.8h, v4.8h, v0.h[5] + fmla v30.8h, v4.8h, v0.h[6] + fmla v31.8h, v4.8h, v0.h[7] + + subs x12, x12, #1 + bne LoopL + + LoopLEnd: + + add x13, x13, x19 + sub x8, x8, #2 + + cbz x5, StoreLH8 + AddBiasLH8: + ld1 {v0.8h, v1.8h}, [x20], #32 + + fmla v16.8h, v0.8h, v5.h[1] + fmla v17.8h, v0.8h, v5.h[1] + fmla v18.8h, v0.8h, v5.h[1] + fmla v19.8h, v0.8h, v5.h[1] + + fmla v20.8h, v1.8h, v5.h[1] + fmla v21.8h, v1.8h, v5.h[1] + fmla v22.8h, v1.8h, v5.h[1] + fmla v23.8h, v1.8h, v5.h[1] + + fmla v24.8h, v0.8h, v5.h[1] + fmla v25.8h, v0.8h, v5.h[1] + fmla v26.8h, v0.8h, v5.h[1] + fmla v27.8h, v0.8h, v5.h[1] + + fmla v28.8h, v1.8h, v5.h[1] + fmla v29.8h, v1.8h, v5.h[1] + fmla v30.8h, v1.8h, v5.h[1] + fmla v31.8h, v1.8h, v5.h[1] + + PostTreatLH8: + fmax v16.8h, v16.8h, v6.8h + fmax v17.8h, v17.8h, v6.8h + fmax v18.8h, v18.8h, v6.8h + fmax v19.8h, v19.8h, v6.8h + fmax v20.8h, v20.8h, v6.8h + fmax v21.8h, v21.8h, v6.8h + fmax v22.8h, v22.8h, v6.8h + fmax v23.8h, v23.8h, v6.8h + fmax v24.8h, v24.8h, v6.8h + fmax v25.8h, v25.8h, v6.8h + fmax v26.8h, v26.8h, v6.8h + fmax v27.8h, v27.8h, v6.8h + fmax v28.8h, v28.8h, v6.8h + fmax v29.8h, v29.8h, v6.8h + fmax v30.8h, v30.8h, v6.8h + fmax v31.8h, v31.8h, v6.8h + + fmin v16.8h, v16.8h, v7.8h + fmin v17.8h, v17.8h, v7.8h + fmin v18.8h, v18.8h, v7.8h + fmin v19.8h, 
v19.8h, v7.8h + fmin v20.8h, v20.8h, v7.8h + fmin v21.8h, v21.8h, v7.8h + fmin v22.8h, v22.8h, v7.8h + fmin v23.8h, v23.8h, v7.8h + fmin v24.8h, v24.8h, v7.8h + fmin v25.8h, v25.8h, v7.8h + fmin v26.8h, v26.8h, v7.8h + fmin v27.8h, v27.8h, v7.8h + fmin v28.8h, v28.8h, v7.8h + fmin v29.8h, v29.8h, v7.8h + fmin v30.8h, v30.8h, v7.8h + fmin v31.8h, v31.8h, v7.8h + + StoreLH8: + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 + st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x14 + + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], x14 + cmp x8, #2 + bge LoopH8x8 + + LH4: + cbz x8, E8End + LoopHRemain: + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.8h}, [x13] + ld1 {v0.8h}, [x15], x11 + fmul v16.8h, v3.8h, v0.h[0] + fmul v17.8h, v3.8h, v0.h[1] + add x13, x13, #32 + fmul v18.8h, v3.8h, v0.h[2] + fmul v19.8h, v3.8h, v0.h[3] + fmul v20.8h, v3.8h, v0.h[4] + fmul v21.8h, v3.8h, v0.h[5] + fmul v22.8h, v3.8h, v0.h[6] + fmul v23.8h, v3.8h, v0.h[7] + beq LoopLREnd + + LoopLR: + ld1 {v3.8h}, [x13] + ld1 {v0.8h}, [x15], x11 + fmla v16.8h, v3.8h, v0.h[0] + fmla v17.8h, v3.8h, v0.h[1] + fmla v18.8h, v3.8h, v0.h[2] + fmla v19.8h, v3.8h, v0.h[3] + add x13, x13, #32 + + fmla v20.8h, v3.8h, v0.h[4] + fmla v21.8h, v3.8h, v0.h[5] + fmla v22.8h, v3.8h, v0.h[6] + fmla v23.8h, v3.8h, v0.h[7] + + subs x12, x12, #1 + bne LoopLR + LoopLREnd: + + cbz x5, StoreLH8x4 + AddBiasLH8x4: + ld1 {v0.8h}, [x20] + + fmla v16.8h, v0.8h, v5.h[1] + fmla v17.8h, v0.8h, v5.h[1] + fmla v18.8h, v0.8h, v5.h[1] + fmla v19.8h, v0.8h, v5.h[1] + + fmla v20.8h, v0.8h, v5.h[1] + fmla v21.8h, v0.8h, v5.h[1] + fmla v22.8h, v0.8h, v5.h[1] + fmla v23.8h, v0.8h, v5.h[1] + + PostTreatLH8x4: + fmax v16.8h, v16.8h, v6.8h + fmax v17.8h, v17.8h, v6.8h + fmax v18.8h, v18.8h, v6.8h + fmax v19.8h, v19.8h, v6.8h + fmax v20.8h, v20.8h, v6.8h + fmax v21.8h, v21.8h, v6.8h + fmax v22.8h, v22.8h, v6.8h + fmax v23.8h, v23.8h, v6.8h + + fmin v16.8h, v16.8h, v7.8h + fmin v17.8h, v17.8h, v7.8h + fmin v18.8h, v18.8h, v7.8h + fmin v19.8h, v19.8h, v7.8h + fmin v20.8h, v20.8h, v7.8h + fmin v21.8h, v21.8h, v7.8h + fmin v22.8h, v22.8h, v7.8h + fmin v23.8h, v23.8h, v7.8h + + StoreLH8x4: + + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + + E8End: + + sub x3, x3, #8 + add x0, x21, #128 + add x1, x1, #16 + +E4: +cmp x3, #4 +mov x20, x6 +blt E1 + mov x8, x10 + mov x21, x0 + mov x13, x2 + + cmp x8, #2 + blt E4LH4 + + E4LH8: + E4LoopH8: + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.8h, v4.8h}, [x13], #32 + ld1 {v0.4h}, [x15], x11 + fmul v16.8h, v3.8h, v0.h[0] + fmul v17.8h, v3.8h, v0.h[1] + fmul v18.8h, v3.8h, v0.h[2] + fmul v19.8h, v3.8h, v0.h[3] + + fmul v20.8h, v4.8h, v0.h[0] + fmul v21.8h, v4.8h, v0.h[1] + fmul v22.8h, v4.8h, v0.h[2] + fmul v23.8h, v4.8h, v0.h[3] + + beq E4LoopLEnd + + subs x12, x12, #1 + ld1 {v3.8h, v4.8h}, [x13], #32 + ld1 {v0.4h}, [x15], x11 + fmla v16.8h, v3.8h, v0.h[0] + fmla v17.8h, v3.8h, v0.h[1] + + beq E4LoopLComputeEnd + + E4LoopL: + fmla v18.8h, v3.8h, v0.h[2] + fmla v19.8h, v3.8h, v0.h[3] + + fmla v20.8h, v4.8h, v0.h[0] + fmla v21.8h, v4.8h, v0.h[1] + fmla v22.8h, v4.8h, v0.h[2] + fmla v23.8h, v4.8h, v0.h[3] + + ld1 {v3.8h, v4.8h}, [x13], #32 + ld1 {v0.4h}, [x15], x11 + fmla v16.8h, v3.8h, v0.h[0] + fmla v17.8h, v3.8h, v0.h[1] + + subs x12, x12, #1 + bne E4LoopL + E4LoopLComputeEnd: + fmla v18.8h, v3.8h, v0.h[2] + fmla v19.8h, v3.8h, v0.h[3] + + fmla v20.8h, v4.8h, v0.h[0] + fmla v21.8h, v4.8h, v0.h[1] + fmla v22.8h, v4.8h, v0.h[2] + fmla v23.8h, v4.8h, v0.h[3] 
+ + E4LoopLEnd: + add x13, x13, x19 + sub x8, x8, #2 + cmp x8, #2 + + cbz x5, StoreLH4x8 + + AddBiasLH4x8: + ld1 {v0.8h, v1.8h}, [x20], #32 + + fmla v16.8h, v0.8h, v5.h[1] + fmla v17.8h, v0.8h, v5.h[1] + fmla v18.8h, v0.8h, v5.h[1] + fmla v19.8h, v0.8h, v5.h[1] + + fmla v20.8h, v1.8h, v5.h[1] + fmla v21.8h, v1.8h, v5.h[1] + fmla v22.8h, v1.8h, v5.h[1] + fmla v23.8h, v1.8h, v5.h[1] + + PostTreatLH4x8: + fmax v16.8h, v16.8h, v6.8h + fmax v17.8h, v17.8h, v6.8h + fmax v18.8h, v18.8h, v6.8h + fmax v19.8h, v19.8h, v6.8h + fmax v20.8h, v20.8h, v6.8h + fmax v21.8h, v21.8h, v6.8h + fmax v22.8h, v22.8h, v6.8h + fmax v23.8h, v23.8h, v6.8h + + fmin v16.8h, v16.8h, v7.8h + fmin v17.8h, v17.8h, v7.8h + fmin v18.8h, v18.8h, v7.8h + fmin v19.8h, v19.8h, v7.8h + fmin v20.8h, v20.8h, v7.8h + fmin v21.8h, v21.8h, v7.8h + fmin v22.8h, v22.8h, v7.8h + fmin v23.8h, v23.8h, v7.8h + + StoreLH4x8: + + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x7 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], x7 + + bge E4LoopH8 + + E4LH4: + cbz x8, E4End + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.8h}, [x13] + ld1 {v0.4h}, [x15], x11 + fmul v16.8h, v3.8h, v0.h[0] + fmul v17.8h, v3.8h, v0.h[1] + fmul v18.8h, v3.8h, v0.h[2] + fmul v19.8h, v3.8h, v0.h[3] + add x13, x13, #32 + + beq E4LoopLREnd + + E4LoopLR: + ld1 {v3.8h}, [x13] + ld1 {v0.4h}, [x15], x11 + fmla v16.8h, v3.8h, v0.h[0] + fmla v17.8h, v3.8h, v0.h[1] + fmla v18.8h, v3.8h, v0.h[2] + fmla v19.8h, v3.8h, v0.h[3] + add x13, x13, #32 + + subs x12, x12, #1 + bne E4LoopLR + E4LoopLREnd: + + cbz x5, StoreLH4x4 + AddBiasLH4x4: + ld1 {v0.8h}, [x20] + + fmla v16.8h, v0.8h, v5.h[1] + fmla v17.8h, v0.8h, v5.h[1] + fmla v18.8h, v0.8h, v5.h[1] + fmla v19.8h, v0.8h, v5.h[1] + + + PostTreatLH4x4: + fmax v16.8h, v16.8h, v6.8h + fmax v17.8h, v17.8h, v6.8h + fmax v18.8h, v18.8h, v6.8h + fmax v19.8h, v19.8h, v6.8h + + fmin v16.8h, v16.8h, v7.8h + fmin v17.8h, v17.8h, v7.8h + fmin v18.8h, v18.8h, v7.8h + fmin v19.8h, v19.8h, v7.8h + + StoreLH4x4: + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0] + + E4End: + + sub x3, x3, #4 + add x0, x21, #64 + add x1, x1, #8 + +E1: +cmp x3, #0 +beq End + +LoopE1: + mov x20, x6 + mov x8, x10 + mov x21, x0 + mov x13, x2 + + cmp x8, #2 + blt E1LH4 + + E1LH8: + E1LoopH8: + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.8h, v4.8h}, [x13], #32 + ld1 {v0.h}[0], [x15], x11 + fmul v16.8h, v3.8h, v0.h[0] + fmul v20.8h, v4.8h, v0.h[0] + + beq E1LoopLEnd + + E1LoopL: + ld1 {v3.8h, v4.8h}, [x13], #32 + ld1 {v0.h}[0], [x15], x11 + fmla v16.8h, v3.8h, v0.h[0] + fmla v20.8h, v4.8h, v0.h[0] + + subs x12, x12, #1 + bne E1LoopL + + E1LoopLEnd: + + add x13, x13, x19 + sub x8, x8, #2 + cmp x8, #2 + + cbz x5, StoreLH1x8 + AddBiasLH1x8: + ld1 {v0.8h, v1.8h}, [x20], #32 + + fmla v16.8h, v0.8h, v5.h[1] + fmla v20.8h, v1.8h, v5.h[1] + + PostTreatLH1x8: + fmax v16.8h, v16.8h, v6.8h + fmax v20.8h, v20.8h, v6.8h + fmin v16.8h, v16.8h, v7.8h + fmin v20.8h, v20.8h, v7.8h + + StoreLH1x8: + + st1 {v16.8h}, [x0], x7 + st1 {v20.8h}, [x0], x7 + + bge E1LoopH8 + + E1LH4: + cbz x8, E1End + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.8h}, [x13] + ld1 {v0.h}[0], [x15], x11 + fmul v16.8h, v3.8h, v0.h[0] + add x13, x13, #32 + + beq E1LoopLREnd + + E1LoopLR: + ld1 {v3.8h}, [x13] + ld1 {v0.h}[0], [x15], x11 + fmla v16.8h, v3.8h, v0.h[0] + add x13, x13, #32 + + subs x12, x12, #1 + bne E1LoopLR + E1LoopLREnd: + + cbz x5, StoreLH1x4 + AddBiasLH1x4: + ld1 {v0.8h}, [x20] + fmla v16.8h, v0.8h, v5.h[1] + + PostTreatLH1x4: + fmax v16.8h, v16.8h, v6.8h + fmin v16.8h, v16.8h, v7.8h + + StoreLH1x4: + st1 {v16.8h}, [x0] + + 
E1End: + + subs x3, x3, #1 + add x0, x21, #16 + add x1, x1, #2 + bne LoopE1 + + +End: +sub sp, sp, #32 +ldr x19, [sp, #0] +ldr x20, [sp, #8] +ldr x21, [sp, #16] +add sp, sp, #32 + +ret + + +#endif diff --git a/source/backend/arm82/asm/arm64/MNNShuffleChannelC8.S b/source/backend/arm82/asm/arm64/MNNShuffleChannelC8.S deleted file mode 100644 index d8ece177..00000000 --- a/source/backend/arm82/asm/arm64/MNNShuffleChannelC8.S +++ /dev/null @@ -1,82 +0,0 @@ -// -// MNNShuffleChannelC8.S -// MNN -// -// Created by MNN on 2020/01/17. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __aarch64__ -#include "MNNAsmGlobal.h" - -.text -.align 5 -asm_function MNNShuffleChannelC8 -// void MNNShuffleChannelC8(FLOAT16* dst, const FLOAT16* src, size_t size, size_t halfFlag) -// Auto: -// x0:dst, x1:src, x2:size, x3:halfFlag - -cbz x3, LOOP_SIZE - -mov x4, #128 -LOOP_SIZE_4: -ldr q0, [x1] -ldr q1, [x1, #16] -ldr q2, [x1, #32] -ldr q3, [x1, #48] -subs x2, x2, #1 -add x1, x1, x4 -st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] -add x0, x0, #64 -bne LOOP_SIZE_4 - -b REAL_END - -LOOP_SIZE: - ldr q0, [x1], #16 - ldr q1, [x1], #16 - ldr q2, [x1], #16 - ldr q3, [x1], #16 - ldr q4, [x1], #16 - ldr q5, [x1], #16 - ldr q6, [x1], #16 - ldr q7, [x1], #16 - zip1 v16.8h, v0.8h, v4.8h - zip1 v17.8h, v2.8h, v6.8h - zip1 v18.8h, v1.8h, v5.8h - zip1 v19.8h, v3.8h, v7.8h - - zip1 v24.8h, v16.8h, v17.8h - zip1 v25.8h, v18.8h, v19.8h - zip2 v26.8h, v16.8h, v17.8h - zip2 v27.8h, v18.8h, v19.8h - - zip1 v28.8h, v24.8h, v25.8h - zip2 v29.8h, v24.8h, v25.8h - zip1 v30.8h, v26.8h, v27.8h - zip2 v31.8h, v26.8h, v27.8h - - st1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x0], #64 - // ----- - zip2 v20.8h, v0.8h, v4.8h - zip2 v21.8h, v2.8h, v6.8h - zip2 v22.8h, v1.8h, v5.8h - zip2 v23.8h, v3.8h, v7.8h - - zip1 v24.8h, v20.8h, v21.8h - zip1 v25.8h, v22.8h, v23.8h - zip2 v26.8h, v20.8h, v21.8h - zip2 v27.8h, v22.8h, v23.8h - - subs x2, x2, #1 - zip1 v28.8h, v24.8h, v25.8h - zip2 v29.8h, v24.8h, v25.8h - zip1 v30.8h, v26.8h, v27.8h - zip2 v31.8h, v26.8h, v27.8h - st1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x0], #64 - bne LOOP_SIZE - - -REAL_END: -ret -#endif \ No newline at end of file diff --git a/source/backend/cpu/BinaryUtils.hpp b/source/backend/cpu/BinaryUtils.hpp index 04c4a1e6..1d08ea49 100644 --- a/source/backend/cpu/BinaryUtils.hpp +++ b/source/backend/cpu/BinaryUtils.hpp @@ -2,128 +2,128 @@ #include template -struct BinaryMax : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryMax { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return std::max(x, y); } }; template -struct BinaryMin : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryMin { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return std::min(x, y); } }; template -struct BinaryMul : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryMul { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return x * y; } }; template -struct BinaryAdd : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryAdd { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return x + y; } }; template -struct BinarySub : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinarySub { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return x - y; } }; template -struct BinaryRealDiv : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryRealDiv { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return x / y; } }; template -struct BinaryMod : 
std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryMod { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return x - x / y; } }; template -struct BinaryGreater : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryGreater { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return (_ErrorCode)((x > y) ? 1 : 0); } }; template -struct BinaryLess : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryLess { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return (_ErrorCode)((x < y) ? 1 : 0); } }; template -struct BinaryGreaterEqual : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryGreaterEqual { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return (_ErrorCode)((x >= y) ? 1 : 0); } }; template -struct BinaryLessEqual : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryLessEqual { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return (_ErrorCode)((x <= y) ? 1 : 0); } }; template -struct BinaryEqual : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryEqual { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return (_ErrorCode)((x == y) ? 1 : 0); } }; template -struct BinaryFloorDiv : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryFloorDiv { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return floor(static_cast(x) / y); } }; template -struct BinaryFloorMod : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryFloorMod { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return x - floor(x / y) * y; } }; template -struct BinarySquaredDifference : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinarySquaredDifference { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return (x - y) * (x - y); } }; template -struct BinaryPow : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryPow { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return pow(x, y); } }; template -struct BinaryAtan2 : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryAtan2 { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return atan(x / y); } }; template -struct BinaryLogicalOr : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryLogicalOr { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return (_ErrorCode)((x || y) ? 1 : 0); } }; template -struct BinaryNotEqual : std::binary_function<_Arg1, _Arg2, _ErrorCode> { +struct BinaryNotEqual { _ErrorCode operator()(const _Arg1& x, const _Arg2& y) const { return (_ErrorCode)((x != y) ? 
1 : 0); } diff --git a/source/backend/cpu/CMakeLists.txt b/source/backend/cpu/CMakeLists.txt new file mode 100644 index 00000000..c82ddc5d --- /dev/null +++ b/source/backend/cpu/CMakeLists.txt @@ -0,0 +1,34 @@ +# CPU +option(MNN_SUPPORT_BF16 "Enable MNN's bf16 op" OFF) +FILE(GLOB MNN_CPU_SRC ${CMAKE_CURRENT_LIST_DIR}/* ${CMAKE_CURRENT_LIST_DIR}/compute/*) +add_library(MNNCPU OBJECT ${MNN_CPU_SRC}) +if (MNN_SUPPORT_BF16) + include(${CMAKE_CURRENT_LIST_DIR}/bf16/CMakeLists.txt) + list(APPEND MNN_TARGETS MNN_BF16) + list(APPEND MNN_OBJECTS_TO_LINK $) + add_definitions(-DMNN_SUPPORT_BF16) # MNNCPU and MNNARM32 need to know flag MNN_SUPPORT_BF16 +endif() +list(APPEND MNN_OBJECTS_TO_LINK $) +list(APPEND MNN_TARGETS MNNCPU) +option(MNN_SSE_USE_FP16_INSTEAD "Use fp16 instead of bf16 for x86op" OFF) + +# X86_64 AVX/SSE +if (MNN_USE_SSE) + include(${CMAKE_CURRENT_LIST_DIR}/x86_x64/CMakeLists.txt) +endif() + +# AArch32/64 Assemblies +include(${CMAKE_CURRENT_LIST_DIR}/arm/CMakeLists.txt) + +IF(NOT DEFINED IOS_ARCH) + set(IOS_ARCH "") +ENDIF() + +# ARM82 Assemblies +IF(MNN_ARM82) + target_compile_options(MNNCPU PRIVATE -DENABLE_ARMV82) + include(${CMAKE_CURRENT_LIST_DIR}/../arm82/CMakeLists.txt) + list(APPEND MNN_TARGETS MNN_Arm82) + list(APPEND MNN_OBJECTS_TO_LINK $) +ENDIF() + diff --git a/source/backend/cpu/CPUBackend.cpp b/source/backend/cpu/CPUBackend.cpp index 0841e045..cda6705a 100644 --- a/source/backend/cpu/CPUBackend.cpp +++ b/source/backend/cpu/CPUBackend.cpp @@ -10,27 +10,31 @@ #include #include #include "core/BufferAllocator.hpp" -#include "backend/cpu/CPUTensorConvert.hpp" -#include "backend/cpu/compute/CommonOptFunction.h" -#include "core/TensorUtils.hpp" -#include "backend/cpu/ThreadPool.hpp" -#include "shape/SizeComputer.hpp" +#include "CPUTensorConvert.hpp" #include "compute/CommonOptFunction.h" +#include "core/TensorUtils.hpp" +#include "ThreadPool.hpp" +#include "core/Concurrency.h" +#include "compute/Int8FunctionsOpt.h" +#include "CPUCast.hpp" +#include "core/OpCommonUtils.hpp" #ifdef _OPENMP #include #endif // _OPENMP #include "backend/cpu/CPURuntime.hpp" -#if defined(__aarch64__) && ENABLE_ARMV82 +#if defined(ENABLE_ARMV82) && (defined(__ANDROID__) || defined(__aarch64__)) #include "backend/arm82/Arm82Backend.hpp" #endif #define MAX_THREAD_NUMBER 32 #define LARGE_MEMORY 1024 * 1024 * 500 +#ifdef MNN_SUPPORT_BF16 +#include "bf16/BF16Backend.hpp" +#endif -//#define MNN_DUMP_MEMORY_USAGE #define MNN_CPU_CHECK_NAN 1 namespace MNN { void registerCPUOps(); -#if defined(__aarch64__) && ENABLE_ARMV82 +#if defined(ENABLE_ARMV82) && (defined(__ANDROID__) || defined(__aarch64__)) struct cpuinfo_arm_isa gCPUInfo; #endif @@ -44,7 +48,7 @@ CPURuntime::CPURuntime(const Backend::Info& info) { mPrecision = BackendConfig::Precision_Normal; mFlags = 0; mFlops = MNNGetCPUFlops(mThreadNumber); -#if defined(__aarch64__) && ENABLE_ARMV82 +#if defined(ENABLE_ARMV82) && (defined(__ANDROID__) || defined(__aarch64__)) mIsSupportDot = gCPUInfo.dot; mIsSupportFp16arith = gCPUInfo.fp16arith; #endif @@ -90,29 +94,33 @@ float CPURuntime::onGetMemoryInMB() { auto staticMemoryInMB = mStaticAllocator->totalSize() / 1024.0f / 1024.0f; return staticMemoryInMB; } -Backend* CPURuntime::onCreate() const{ -#if defined(__aarch64__) && ENABLE_ARMV82 - if (mIsSupportFp16arith && mPrecision == BackendConfig::Precision_Low) { +Backend* CPURuntime::onCreate(const BackendConfig* config) const { + auto precision = mPrecision; + if (nullptr != config) { + precision = config->precision; + } +#if defined(ENABLE_ARMV82) && 
(defined(__ANDROID__) || defined(__aarch64__)) + if (mIsSupportFp16arith && precision == BackendConfig::Precision_Low) { return new Arm82Backend(this); } #endif - return new CPUBackend(this); +#ifdef MNN_SUPPORT_BF16 + if (precision == BackendConfig::Precision_Low) { + return new BF16Backend(this); + } +#endif + return new CPUBackend(this, precision); } void CPURuntime::onGabageCollect(int level) { mStaticAllocator->release(false); } std::map* CPUBackend::gCreator = nullptr; - void CPUBackend::initCreatorMap() { gCreator = new std::map; } -std::map* CPUBackend::getCreatorMap() { - return gCreator; -} - bool CPUBackend::addCreator(OpType t, Creator* c) { - auto map = getCreatorMap(); + auto map = gCreator; if (map->find(t) != map->end()) { MNN_PRINT("Error: %d type has be added\n", t); return false; @@ -121,12 +129,14 @@ bool CPUBackend::addCreator(OpType t, Creator* c) { return true; } -CPUBackend::CPUBackend(const CPURuntime* runtime, MNNForwardType type) : Backend(type) { +CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode precision, MNNForwardType type) : Backend(type) { mRuntime = runtime; mCheckNAN = runtime->mFlags == MNN_CPU_CHECK_NAN; std::shared_ptr defaultAlloc(BufferAllocator::Allocator::createRecurse(runtime->mStaticAllocator.get())); mDynamicAllocator.reset(new BufferAllocator(defaultAlloc)); mStaticAllocator = runtime->mStaticAllocator; + mPrecisionMode = precision; + mCoreFunctions = MNNGetCoreFunctions(); } bool CPUBackend::supportDot() const { return mRuntime->mIsSupportDot; @@ -159,12 +169,13 @@ void CPUBackend::onExecuteEnd() const { bool CPUBackend::allocBuffer(int size, Tensor* dest, StorageType storageType) { // MNN_PRINT("Acquire size = %d\n", size); if (size <= 0) { + MNN_PRINT("Acquire buffer size = %d\n", size); MNN_ASSERT(false); return false; } - if (size > LARGE_MEMORY) { - MNN_PRINT("Size larger than 500 M :%d\n", size); - } + // if (size > LARGE_MEMORY) { + // MNN_PRINT("Size larger than 500 M :%d\n", size); + // } auto& buffer = dest->buffer(); auto des = TensorUtils::getDescribe(dest); std::pair points; @@ -233,18 +244,65 @@ bool CPUBackend::onReleaseBuffer(const MNN::Tensor* nativeTensor, StorageType st std::pair CPUBackend::onMeasure(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op) { - auto map = getCreatorMap(); + auto map = gCreator; auto iter = map->find(op->type()); if (iter == map->end()) { MNN_PRINT("Don't support type %s, %s\n", MNN::EnumNameOpType(op->type()), op->name()->c_str()); return std::make_pair(0.0f, false); } -#ifndef MNN_BUILD_MINI - auto computeFlops = SizeComputer::computeFlops(op, inputs, outputs); - return std::make_pair(computeFlops / mRuntime->mFlops * 1000.0f, true); -#else + // FIXME: Compute in future return std::make_pair(0.0f, false); -#endif +} + +halide_type_t CPUBackend::getRunType(const Op* op, halide_type_t qtype, halide_type_t rtype) { + auto otype = op->type(); + switch (otype) { + case OpType_Convolution: + case OpType_ConvolutionDepthwise: + case OpType_Eltwise: + case OpType_Raster: + return qtype; + case OpType_ReLU: + // now just relu without slope support quant + if ((op->main_as_Relu() == nullptr) || op->main_as_Relu()->slope() == 0.f) { + return qtype; + } else { + return rtype; + } + /* + case OpType_Pooling: + // now just maxpool support quant + if (op->main_as_Pool() && op->main_as_Pool()->type() == PoolType_MAXPOOL) { + return qtype; + } else { + return defaultType; + } + */ + default: + return rtype; + } +} + +OpType CPUBackend::getRealOpType(OpType 
opType, halide_type_t dataType) { + // now just support int8 + if (dataType != halide_type_of()) { + return opType; + } + switch (opType) { + case OpType_Convolution: + return OpType_ConvInt8; + case OpType_ConvolutionDepthwise: + return OpType_DepthwiseConvInt8; + /* + case OpType_Pooling: + return OpType_PoolInt8; + */ + case OpType_Eltwise: + // TODO: just support EltwiseAdd + return OpType_EltwiseInt8; + default: + return opType; + } } /// get execution @@ -257,15 +315,238 @@ Execution* CPUBackend::onCreate(const std::vector& inputs, const std::v if (op->type() == OpType_BatchNorm) { return nullptr; } - auto map = getCreatorMap(); - auto iter = map->find(op->type()); + // get QuantType and RunType, default is float + halide_type_t quantType = halide_type_of(); + auto isQuant = OpCommonUtils::getQuantInfo(inputs); + if (isQuant.first) { + // if output hasnt scale, using output type + if (TensorUtils::getDescribe(outputs[0])->quantAttr == nullptr && !outputs.empty()) { + quantType = outputs[0]->getType(); + } else { + quantType = TensorUtils::DataTypeToHalideType(isQuant.second); + } + } + auto originType = outputs.empty() ? halide_type_of() : outputs[0]->getType(); + auto runType = getRunType(op, quantType, originType); + // TODO: rm this convert when merge diff datatyoe of op + auto opType = op->type(); + if (isQuant.first) { + opType = getRealOpType(opType, runType); + } + auto map = gCreator; + auto iter = map->find(opType); if (iter == map->end()) { MNN_PRINT("Don't support type [%s], %s\n", MNN::EnumNameOpType(op->type()), op->name()->c_str()); return nullptr; } - auto exe = iter->second->onCreate(inputs, outputs, op, this); + Execution* exe = nullptr; + if (isQuant.first) { + bool needCast = false; + // judge is it need CastWrap + if (OpType_Raster == opType) { + inputs[0]->setType(TensorUtils::HaildeTypeToDataType(runType)); + for (const auto& r : TensorUtils::getDescribe(inputs[0])->regions) { + needCast |= (r.origin->getType() != runType); + } + } else { + for (int i = 0; i < inputs.size(); i++) { + if (OpCommonUtils::opNeedContent(opType, i) && inputs[i]->getType() != halide_type_of()) { + needCast |= (inputs[i]->getType() != runType); + } + } + } + // set output Tensor Type + auto outputType = TensorUtils::HaildeTypeToDataType(runType); + for (auto output : outputs) { + if (output->getType() != runType) { + output->setType(outputType); + needCast = true; + } + } + if (needCast) { + class CastWrapExecution : public Execution { + public: + CastWrapExecution(Backend* backend, halide_type_t runT, const Op* op, std::map& cachedCastTensor, Execution* exe) + : Execution(backend), runType(runT), mOp(op), mCachedCastTensor(cachedCastTensor), mExecution(exe) {} + CastWrapExecution(const CPUBackend::Creator* creator, const Op* op, Backend* backend, + const std::vector &inputs, const std::vector &outputs, + halide_type_t runT, std::map& cachedCastTensor) + : Execution(backend), runType(runT), mCreator(creator), mOp(op), + mCachedCastTensor(cachedCastTensor), mInputs(inputs) { + std::vector types(inputs.size()); + for (int i = 0; i < inputs.size(); i++) { + types[i] = TensorUtils::HaildeTypeToDataType(inputs[i]->getType()); + inputs[i]->setType(TensorUtils::HaildeTypeToDataType(runType)); + } + mExecution.reset(mCreator->onCreate(inputs, outputs, mOp, backend)); + for (int i = 0; i < inputs.size(); i++) { + inputs[i]->setType(types[i]); + } + } + virtual ErrorCode onResize(const std::vector& inputs, + const std::vector& outputs) override { + for (auto output : outputs) { + 
output->setType(TensorUtils::HaildeTypeToDataType(runType)); + } + mWrapInputTensors.clear(); + mWrapInputs.clear(); + mCasts.clear(); + mScales.clear(); + std::vector realInput; + if (mOp->type() == OpType_Raster) { + for (const auto& r : TensorUtils::getDescribe(inputs[0])->regions) { + realInput.push_back(r.origin); + } + } else { + realInput = inputs; + } + for (int i = 0; i < realInput.size(); i++) { + auto input = realInput[i]; + if (input->getType() == runType || !OpCommonUtils::opNeedContent(mOp->type(), i) || input->getType() == halide_type_of()) { + mWrapInputs.push_back(input); + continue; + } + if (mCachedCastTensor.find(input) != mCachedCastTensor.end()) { + mWrapInputs.push_back(const_cast(mCachedCastTensor[input])); + continue; + } + std::unique_ptr wrapTensor(new Tensor); + TensorUtils::copyShape(input, wrapTensor.get(), true); + TensorUtils::getDescribe(wrapTensor.get())->quantAttr = TensorUtils::getDescribe(input)->quantAttr; + wrapTensor->buffer().type = runType; + bool memoryAllocSuccess = backend()->onAcquireBuffer(wrapTensor.get(), Backend::DYNAMIC); + if (!memoryAllocSuccess) { + return {}; + } + mWrapInputs.push_back(wrapTensor.get()); + auto wrapPointer = wrapTensor.get(); + mCasts.insert(std::make_pair(input, wrapTensor.get())); + mCachedCastTensor.insert(std::make_pair(input, wrapTensor.get())); + mWrapInputTensors.emplace_back(std::move(wrapTensor)); + mScales[input] = std::vector(4); + auto& quantAttr = TensorUtils::getDescribe(input)->quantAttr; + float scale = runType == halide_type_of() ? quantAttr->scale : 1/quantAttr->scale; + // set 4xscale for SSE compute + mScales[input][0] = scale; + mScales[input][1] = scale; + mScales[input][2] = scale; + mScales[input][3] = scale; + } + ErrorCode res = NO_ERROR; + if (mOp->type() == OpType_Raster) { + mRasterInput = inputs[0]; + if (mCasts.size() > 0) { + mRasterInputTensor.reset(new Tensor(inputs[0], inputs[0]->getDimensionType(), false)); + mRasterInput = mRasterInputTensor.get(); + TensorUtils::getDescribe(mRasterInput)->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL; + TensorUtils::getDescribe(mRasterInput)->regions.resize(realInput.size()); + for (int i = 0; i < realInput.size(); i++) { + TensorUtils::getDescribe(mRasterInput)->regions[i] = TensorUtils::getDescribe(inputs[0])->regions[i]; + TensorUtils::getDescribe(mRasterInput)->regions[i].origin = mWrapInputs[i]; + } + } + res = mExecution->onResize({mRasterInput}, outputs); + } else { + res = mExecution->onResize(mWrapInputs, outputs); + } + for (auto& iter : mCasts) { + if (TensorUtils::getDescribe(iter.first)->useCount <= 1) { + backend()->onReleaseBuffer(iter.second, Backend::DYNAMIC); + } + } + return res; + } + + virtual ErrorCode onExecute(const std::vector& inputs, + const std::vector& outputs) override { + for (const auto& iter : mCasts) { + auto input = iter.first; + auto output = iter.second; + auto& quantAttr = TensorUtils::getDescribe(input)->quantAttr; + MNN_ASSERT(quantAttr != nullptr); + auto numberThread = ((CPUBackend*)backend())->threadNumber(); + if (numberThread == 1) { + CPUCastCreator::cast(input, output); + continue; + } + int size = input->elementSize(); + int sizeQuad = size / 16; + int remain = sizeQuad * 16; + int sizeDivide = sizeQuad / numberThread; + auto scale = mScales[input].data(); + if (runType == halide_type_of()) { + const auto inputDataPtr = input->host(); + auto outputDataPtr = output->host(); + if (sizeQuad > 0) { + MNN_CONCURRENCY_BEGIN(tId, numberThread) { + int number = sizeDivide; + if (tId == numberThread - 
1) { + number = sizeQuad - tId * sizeDivide; + } + const auto srcChannelPtr = inputDataPtr + tId * sizeDivide * 16; + auto dstChannlePtr = outputDataPtr + tId * sizeDivide * 16; + MNNInt8ScaleToFloat(dstChannlePtr, srcChannelPtr, scale, sizeDivide * 4, quantAttr->zero); + } + MNN_CONCURRENCY_END(); + } + for (int i = remain; i < size; i++) { + outputDataPtr[i] = static_cast(std::min(std::max(inputDataPtr[i] * scale[0], quantAttr->min), quantAttr->max)); + } + } else { + const auto inputDataPtr = input->host(); + auto outputDataPtr = output->host(); + if (sizeQuad > 0) { + MNN_CONCURRENCY_BEGIN(tId, numberThread) { + int number = sizeDivide; + if (tId == numberThread - 1) { + number = sizeQuad - tId * sizeDivide; + } + const auto srcChannelPtr = inputDataPtr + tId * sizeDivide * 16; + auto dstChannlePtr = outputDataPtr + tId * sizeDivide * 16; + MNNFloat2Int8(srcChannelPtr, dstChannlePtr, sizeDivide * 4, scale, quantAttr->min, quantAttr->max, quantAttr->zero); + } + MNN_CONCURRENCY_END(); + } + for (int i = remain; i < size; i++) { + outputDataPtr[i] = static_cast(inputDataPtr[i]) * scale[0]; + } + } + } + if (mOp->type() == OpType_Raster) { + return mExecution->onExecute({ mRasterInput }, outputs); + } else { + return mExecution->onExecute(mWrapInputs, outputs); + } + } + virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override { + if (dst == nullptr || bn == nullptr) { + return true; + } + Execution* exe; + mExecution->onClone(bn, op, &exe); + *dst = new CastWrapExecution(bn, runType, op, mCachedCastTensor, exe); + return true; + }; + private: + const Op* mOp; + const CPUBackend::Creator* mCreator; + halide_type_t runType; + std::shared_ptr mExecution; + Tensor* mRasterInput; + std::vector mWrapInputs, mInputs; + std::unique_ptr mRasterInputTensor; + std::vector> mWrapInputTensors; + std::map mCasts, &mCachedCastTensor; + std::map> mScales; + bool firstResize = true; + }; + exe = new CastWrapExecution(iter->second, op, this, inputs, outputs, runType, mCachedCastTensor); + } + } + if (exe == nullptr) { + exe = iter->second->onCreate(inputs, outputs, op, this); + } if (nullptr == exe) { - MNN_PRINT("The Creator Don't support type [%s], %s\n", MNN::EnumNameOpType(op->type()), op->name()->c_str()); return nullptr; } if (mCheckNAN) { @@ -289,6 +570,9 @@ Execution* CPUBackend::onCreate(const std::vector& inputs, const std::v if (halide_type_float != tensor->getType().code) { return NO_ERROR; } + if (TensorUtils::getDescribe(tensor)->memoryType == Tensor::InsideDescribe::MEMORY_VIRTUAL) { + return NO_ERROR; + } auto size = tensor->elementSize(); auto ptr = tensor->host(); for (int i = 0; i < size; ++i) { @@ -328,12 +612,13 @@ Execution* CPUBackend::onCreate(const std::vector& inputs, const std::v bool CPUBackend::onClearBuffer() { mDynamicAllocator->release(true); + mCachedCastTensor.clear(); return true; } std::pair CPUBackend::multiThreadDivide(int size) const { int sizeDivide = size / threadNumber(); - sizeDivide = UP_DIV(sizeDivide, 4) * 4; + sizeDivide = UP_DIV(sizeDivide, mCoreFunctions->pack) * mCoreFunctions->pack; int scheduleNumber = 1; if (sizeDivide > 0) { scheduleNumber = UP_DIV(size, sizeDivide); @@ -345,7 +630,6 @@ void CPUBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) auto& dstBuffer = dstTensor->buffer(); MNN_ASSERT(srcBuffer.dimensions == dstBuffer.dimensions); - MNN_ASSERT(srcBuffer.type == dstBuffer.type); if (srcTensor->getDimensionType() == dstTensor->getDimensionType()) { for (int i = 0; i < srcBuffer.dimensions; ++i) { 
MNN_ASSERT(srcBuffer.dim[i].extent <= dstBuffer.dim[i].extent); @@ -354,10 +638,17 @@ void CPUBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) if (nullptr == srcBuffer.host || nullptr == dstBuffer.host) { return; } - + if (srcBuffer.type != dstBuffer.type) { + auto code = CPUCastCreator::cast(srcTensor, dstTensor); + if (NO_ERROR != code) { + MNN_ERROR("Error in CPUBackend::onCopyBuffer:cast\n"); + return; + } + srcTensor = dstTensor; + } auto code = CPUTensorConverter::convert(srcTensor, dstTensor); if (NO_ERROR != code) { - MNN_ERROR("Error in CPUBackend::onCopyBuffer\n"); + MNN_ERROR("Error in CPUBackend::onCopyBuffer:convert\n"); } } @@ -369,11 +660,18 @@ public: }; +#ifdef MNN_SUPPORT_BF16 +extern void registerBF16Backend(); +#endif void registerCPURuntimeCreator() { CPUBackend::initCreatorMap(); registerCPUOps(); - MNNFunctionInit(); -#if defined(__aarch64__) && ENABLE_ARMV82 +#ifdef MNN_SUPPORT_BF16 + registerBF16Backend(); +#endif + // TODO: Merge _initCoreFunction MNNFunctionInit and cpuinfo_arm_init + MNNCoreFunctionInit(); +#if defined(ENABLE_ARMV82) && (defined(__ANDROID__) || defined(__aarch64__)) cpuinfo_arm_init(&gCPUInfo); #endif MNNInsertExtraRuntimeCreator(MNN_FORWARD_CPU, new CPURuntimeCreator); diff --git a/source/backend/cpu/CPUBackend.hpp b/source/backend/cpu/CPUBackend.hpp index a920461b..e461d12b 100644 --- a/source/backend/cpu/CPUBackend.hpp +++ b/source/backend/cpu/CPUBackend.hpp @@ -9,7 +9,6 @@ #ifndef CPUBackend_hpp #define CPUBackend_hpp -#include #include #include #include "core/Backend.hpp" @@ -29,7 +28,7 @@ public: bool supportFp16() const { return mIsSupportFp16arith; } - virtual Backend* onCreate() const override; + virtual Backend* onCreate(const BackendConfig* config) const override; virtual void onGabageCollect(int level) override; virtual float onGetMemoryInMB() override; private: @@ -48,10 +47,11 @@ private: float mFlops = 0.0f; static Backend*(*gExtraCreate)(const Runtime* runtime); }; +struct CoreFunctions; class CPUBackend : public Backend { public: - CPUBackend(const CPURuntime* runtime, MNNForwardType type = MNN_FORWARD_CPU); + CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode precision, MNNForwardType type = MNN_FORWARD_CPU); virtual ~CPUBackend(); // Return sizeDivide, scheduleNumber aligned memory @@ -68,7 +68,10 @@ public: const MNN::Op* op) override; virtual void onExecuteBegin() const override; virtual void onExecuteEnd() const override; - + + const CoreFunctions* functions() const { + return mCoreFunctions; + } public: class Creator { public: @@ -89,21 +92,28 @@ public: BackendConfig::MemoryMode memoryMode() const { return mRuntime->mMemory; } + BackendConfig::PrecisionMode precisionMode() const { + return mPrecisionMode; + } #ifdef MNN_USE_THREAD_POOL inline int taskIndex() const {return mRuntime->mTaskIndex;} #endif bool supportDot() const; static void initCreatorMap(); - + halide_type_t getRunType(const Op* op, halide_type_t qtype, halide_type_t rtype) override; +private: + OpType getRealOpType(OpType opType, halide_type_t dataType); protected: bool allocBuffer(int size, Tensor* dest, StorageType storageType); + const CoreFunctions* mCoreFunctions; private: std::shared_ptr mStaticAllocator; std::shared_ptr mDynamicAllocator; bool mCheckNAN = false; const CPURuntime* mRuntime; - static std::map* getCreatorMap(); + BackendConfig::PrecisionMode mPrecisionMode; static std::map* gCreator; + std::map mCachedCastTensor; }; #define REGISTER_CPU_OP_CREATOR(name, opType) \ diff --git 
a/source/backend/cpu/CPUBatchMatMul.cpp b/source/backend/cpu/CPUBatchMatMul.cpp index eb0e299f..88f71498 100644 --- a/source/backend/cpu/CPUBatchMatMul.cpp +++ b/source/backend/cpu/CPUBatchMatMul.cpp @@ -12,7 +12,7 @@ #include "core/TensorUtils.hpp" #include "core/BufferAllocator.hpp" #include "core/Concurrency.h" - +#include "compute/CommonOptFunction.h" namespace MNN { CPUBatchMatMul::CPUBatchMatMul(Backend* backend, bool adjX, bool adjY) : Execution(backend) { @@ -79,9 +79,10 @@ ErrorCode CPUBatchMatMul::onExecute(const std::vector& inputs, const st auto input0 = inputs[0]; auto input1 = inputs[1]; auto output = outputs[0]; + auto core = static_cast(backend())->functions(); // Fill output by zero if one of inputs is empty. if (input0->elementSize() == 0 || input1->elementSize() == 0) { - ::memset(output->host(), 0, output->size()); + ::memset(output->host(), 0, output->elementSize() * core->bytes); return NO_ERROR; } const int dimensions = input0->dimensions(); @@ -89,9 +90,9 @@ ErrorCode CPUBatchMatMul::onExecute(const std::vector& inputs, const st const int input0Stride = input0->length(dimensions - 1) * input0->length(dimensions - 2); const int input1Stride = input1->length(dimensions - 1) * input1->length(dimensions - 2); const int outputStride = output->length(dimensions - 1) * output->length(dimensions - 2); - const auto input0Ptr = input0->host(); - const auto input1Ptr = input1->host(); - float* const outputPtr = output->host(); + auto input0Ptr = input0->host(); + auto input1Ptr = input1->host(); + auto outputPtr = output->host(); int threadNumber = static_cast(backend())->threadNumber(); if (threadNumber > mBatch) { threadNumber = mBatch; @@ -99,9 +100,9 @@ ErrorCode CPUBatchMatMul::onExecute(const std::vector& inputs, const st MNN_CONCURRENCY_BEGIN(tId, threadNumber) { auto& unit = mUnits[tId]; for (int i = (int)tId; i < mBatch; i+=threadNumber) { - unit.mMatrixA->buffer().host = (uint8_t*)(input0Ptr + i * input0Stride); - unit.mMatrixB->buffer().host = (uint8_t*)(input1Ptr + i * input1Stride); - unit.mMatrixC->buffer().host = (uint8_t*)(outputPtr + i * outputStride); + unit.mMatrixA->buffer().host = (uint8_t*)(input0Ptr + i * input0Stride * core->bytes); + unit.mMatrixB->buffer().host = (uint8_t*)(input1Ptr + i * input1Stride * core->bytes); + unit.mMatrixC->buffer().host = (uint8_t*)(outputPtr + i * outputStride * core->bytes); unit.mMatMul->onExecute(unit.mTempInputs, unit.mTempOutputs); } } diff --git a/source/backend/cpu/CPUCast.cpp b/source/backend/cpu/CPUCast.cpp index fb059383..f0ca216a 100644 --- a/source/backend/cpu/CPUCast.cpp +++ b/source/backend/cpu/CPUCast.cpp @@ -7,9 +7,57 @@ // #include "backend/cpu/CPUCast.hpp" +#include "core/TensorUtils.hpp" #include "core/Macro.h" +#include "backend/cpu/compute/Int8FunctionsOpt.h" namespace MNN { +ErrorCode CPUCastCreator::cast(void* const inputRaw, void* outputRaw, halide_type_t inputType, halide_type_t outputType, + int number, float scale, float zero, float min, float max) { + int c4Size = number / 4; + int remain = c4Size * 4; + std::vector scales(4, scale); + if (inputType == halide_type_of() && outputType == halide_type_of()) { + std::for_each(scales.begin(), scales.end(), [](float& x){ x = x == 0.f ? 
0.f : 1 / x; }); + MNNFloat2Int8(static_cast(inputRaw), static_cast(outputRaw), c4Size, scales.data(), min, max, zero); + for (int i = remain; i < number; i++) { + float x = static_cast(inputRaw)[i] * scale; + static_cast(outputRaw)[i] = std::max(std::min(x, max), min);; + } + return NO_ERROR; + } + if (inputType == halide_type_of() && outputType == halide_type_of()) { + MNNInt8ScaleToFloat(static_cast(outputRaw), static_cast(inputRaw), scales.data(), c4Size, zero); + for (int i = remain; i < number; i++) { + static_cast(outputRaw)[i] = static_cast(inputRaw)[i] * scale; + } + return NO_ERROR; + } + MNN_ERROR("Don't support cast type \n"); + return NOT_SUPPORT; +} +ErrorCode CPUCastCreator::cast(const Tensor* input, const Tensor* output) { + auto srcT = input->getType(); + auto dstT = output->getType(); + auto ib = input->buffer(); + auto ob = output->buffer(); + if (srcT == dstT) { + ::memcpy(ib.host, ob.host, input->size()); + return NO_ERROR; + } + auto& quantAttr = TensorUtils::getDescribe(input)->quantAttr; + if (quantAttr == nullptr) { + MNN_ERROR("No quant info for Cast\n"); + return INVALID_VALUE; + } + int totalSize = input->elementSize(); + auto code = cast(ib.host, ob.host, srcT, dstT, totalSize, quantAttr->scale, quantAttr->zero, quantAttr->min, quantAttr->max); + if (NO_ERROR != code) { + MNN_ERROR("Error in CPUCast\n"); + return code; + } + return NO_ERROR; +} template class CastDataType : public Execution { diff --git a/source/backend/cpu/CPUCast.hpp b/source/backend/cpu/CPUCast.hpp index 92b7b717..f1ce00cd 100644 --- a/source/backend/cpu/CPUCast.hpp +++ b/source/backend/cpu/CPUCast.hpp @@ -16,6 +16,8 @@ class CPUCastCreator : public CPUBackend::Creator { public: virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* backend) const override; + static ErrorCode cast(const Tensor* input, const Tensor* output); + static ErrorCode cast(void* const inputRaw, void* outputRaw, halide_type_t inputType, halide_type_t outputType, int number, float scale, float zero, float min, float max); }; } // namespace MNN #endif /* CPUCast_hpp */ diff --git a/source/backend/cpu/CPUConvArm82Int8.cpp b/source/backend/cpu/CPUConvArm82Int8.cpp index e970b7bd..b5250c17 100644 --- a/source/backend/cpu/CPUConvArm82Int8.cpp +++ b/source/backend/cpu/CPUConvArm82Int8.cpp @@ -6,6 +6,7 @@ // Copyright © 2018, Alibaba Group Holding Limited // +// MNNGemmInt8AddBiasScale_ARMV82_Unit.S is only available when arm64 now, so don't change this #if defined(__aarch64__) && defined(ENABLE_ARMV82) #include "CPUConvArm82Int8.hpp" #include "compute/Int8FunctionsOpt.h" @@ -13,7 +14,7 @@ #include "core/TensorUtils.hpp" #include "core/Concurrency.h" namespace MNN { -CPUConvArm82Int8::CPUConvArm82Int8(Backend* backend, const MNN::Convolution2D* convParam) +CPUConvArm82Int8::CPUConvArm82Int8(Backend* backend, const MNN::Convolution2D* convParam, float inputScale, float outputScale) : CPUConvolution(convParam->common(), backend) { const auto convCommon = convParam->common(); const auto kx = convCommon->kernelX(); @@ -25,11 +26,12 @@ CPUConvArm82Int8::CPUConvArm82Int8(Backend* backend, const MNN::Convolution2D* c const auto srcCountUnit = UP_DIV(srcCount, GEMM_INT8_UNIT); const auto totalKernelCountUnit = srcCountUnit * kernelCount; - mResource.reset(new CPUConvArm82Int8::Resource); + mResource.reset(new CPUConvInt8::ResourceInt8); + mResource->mInputScale = inputScale; + mResource->mOutputScale = outputScale; mResource->backend = backend; 
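+    // The input/output scales captured at creation time are kept on the shared
+    // ResourceInt8 so updateInputOutputScale() can refold the per-channel
+    // scale/bias in onResize() when the tensors' runtime quantization scales differ.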
mResource->mWeightInt8.reset(Tensor::createDevice({outputCountUnit, totalKernelCountUnit, GEMM_INT8_UNIT, GEMM_INT8_UNIT})); - auto weightSrc = convParam->symmetricQuan()->weight()->data(); auto allocRes = backend->onAcquireBuffer(mResource->mWeightInt8.get(), Backend::STATIC); if (!allocRes) { mValid = false; @@ -37,10 +39,27 @@ CPUConvArm82Int8::CPUConvArm82Int8(Backend* backend, const MNN::Convolution2D* c } const int weightOutputChannelStride = mResource->mWeightInt8->stride(0); + mResource->mBiasInt32.reset(Tensor::createDevice({outputCountUnit * GEMM_INT8_UNIT})); + allocRes = backend->onAcquireBuffer(mResource->mBiasInt32.get(), Backend::STATIC); + if (!allocRes) { + mValid = false; + return; + } + mResource->mScaleFloat.reset(Tensor::createDevice({outputCountUnit * GEMM_INT8_UNIT})); + allocRes = backend->onAcquireBuffer(mResource->mScaleFloat.get(), Backend::STATIC); + if (!allocRes) { + mValid = false; + return; + } + auto biasPtr = mResource->mBiasInt32->host(); + auto scalePtr = mResource->mScaleFloat->host(); + memset(biasPtr, 0, outputCountUnit * GEMM_INT8_UNIT * sizeof(int32_t)); + memset(scalePtr, 0, outputCountUnit * GEMM_INT8_UNIT * sizeof(float)); + const int8_t* weightSrc = nullptr; + std::shared_ptr quanCommon; - if (convParam->quanParameter() != nullptr) { - quanCommon = ConvolutionCommon::load(convParam->quanParameter(), false); - weightSrc = quanCommon->weight.get(); + if (!ConvolutionCommon::getConvInt8Parameters(convParam, quanCommon, weightSrc, scalePtr, biasPtr, inputScale, outputScale)) { + return; } auto weightDst = mResource->mWeightInt8->host(); memset(weightDst, 0, mResource->mWeightInt8->size()); @@ -64,48 +83,16 @@ CPUConvArm82Int8::CPUConvArm82Int8(Backend* backend, const MNN::Convolution2D* c } } } - - mResource->mBiasInt32.reset(Tensor::createDevice({outputCountUnit * GEMM_INT8_UNIT})); - allocRes = backend->onAcquireBuffer(mResource->mBiasInt32.get(), Backend::STATIC); - if (!allocRes) { - mValid = false; - return; - } - auto biasPtr = mResource->mBiasInt32->host(); - memset(biasPtr, 0, outputCountUnit * GEMM_INT8_UNIT * sizeof(int32_t)); - memcpy(biasPtr, convParam->symmetricQuan()->bias()->data(), outputCount * sizeof(int32_t)); - - mResource->mScaleFloat.reset(Tensor::createDevice({outputCountUnit * GEMM_INT8_UNIT})); - allocRes = backend->onAcquireBuffer(mResource->mScaleFloat.get(), Backend::STATIC); - if (!allocRes) { - mValid = false; - return; - } - - auto scalePtr = mResource->mScaleFloat->host(); - memset(scalePtr, 0, outputCountUnit * GEMM_INT8_UNIT * sizeof(float)); - memcpy(scalePtr, convParam->symmetricQuan()->scale()->data(), outputCount * sizeof(float)); mRelu = convCommon->relu() || convCommon->relu6(); } -CPUConvArm82Int8::CPUConvArm82Int8(std::shared_ptr res, Backend* backend, const MNN::Convolution2DCommon* convCommon) : CPUConvolution(convCommon, backend) { +CPUConvArm82Int8::CPUConvArm82Int8(std::shared_ptr res, Backend* backend, const MNN::Convolution2DCommon* convCommon) : CPUConvolution(convCommon, backend) { mResource = res; mRelu = convCommon->relu() || convCommon->relu6(); } -CPUConvArm82Int8::Resource::~Resource() { - if(mWeightInt8 != nullptr){ - backend->onReleaseBuffer(mWeightInt8.get(), Backend::STATIC); - } - if(mBiasInt32 != nullptr){ - backend->onReleaseBuffer(mBiasInt32.get(), Backend::STATIC); - } - if(mScaleFloat != nullptr){ - backend->onReleaseBuffer(mScaleFloat.get(), Backend::STATIC); - } -} - ErrorCode CPUConvArm82Int8::onResize(const std::vector& inputs, const std::vector& outputs) { + 
mResource->updateInputOutputScale(TensorUtils::getScale(inputs[0]), TensorUtils::getScale(outputs[0])); CPUConvolution::onResize(inputs, outputs); auto input = inputs[0]; auto output = outputs[0]; diff --git a/source/backend/cpu/CPUConvArm82Int8.hpp b/source/backend/cpu/CPUConvArm82Int8.hpp index 37198cac..94d49aa2 100644 --- a/source/backend/cpu/CPUConvArm82Int8.hpp +++ b/source/backend/cpu/CPUConvArm82Int8.hpp @@ -7,21 +7,15 @@ // #ifndef CPUConvArm82Int8_hpp #define CPUConvArm82Int8_hpp -#if defined(__aarch64__) && defined(ENABLE_ARMV82) +#if defined(ENABLE_ARMV82) && (defined(__ANDROID__) || defined(__aarch64__)) #include "backend/cpu/CPUConvolution.hpp" +#include "backend/cpu/CPUConvInt8.hpp" #include namespace MNN { class CPUConvArm82Int8 : public CPUConvolution { public: - struct Resource { - std::shared_ptr mWeightInt8; - std::shared_ptr mBiasInt32; - std::shared_ptr mScaleFloat; - Backend* backend; - ~ Resource(); - }; - CPUConvArm82Int8(Backend *backend, const MNN::Convolution2D *convParam); - CPUConvArm82Int8(std::shared_ptr res, Backend* backend, const MNN::Convolution2DCommon* common); + CPUConvArm82Int8(Backend *backend, const MNN::Convolution2D *convParam, float inputScale, float outputScale); + CPUConvArm82Int8(std::shared_ptr res, Backend* backend, const MNN::Convolution2DCommon* common); virtual ~CPUConvArm82Int8() { // Do nothing @@ -33,9 +27,7 @@ public: private: // relu or relu6 bool mRelu; - std::shared_ptr mResource; - - + std::shared_ptr mResource; ConvolutionCommon::Im2ColParameter mIm2ColParamter; int mTileCount; int mThreadNums; diff --git a/source/backend/cpu/CPUConvInt8.cpp b/source/backend/cpu/CPUConvInt8.cpp index 3e27c86e..548665d0 100644 --- a/source/backend/cpu/CPUConvInt8.cpp +++ b/source/backend/cpu/CPUConvInt8.cpp @@ -10,6 +10,7 @@ #ifdef MNN_USE_ONEDNN #include "backend/cpu/OneDNNConvInt8.hpp" #endif +// MNNGemmInt8AddBiasScale_ARMV82_Unit.S is only available when arm64 now, so don't change this #if defined(__aarch64__) && defined(ENABLE_ARMV82) #include "backend/cpu/CPUConvArm82Int8.hpp" #endif @@ -24,6 +25,7 @@ #include #include "compute/ConvInt83x3.hpp" #include "compute/ConvolutionWinograd.hpp" +#include "compute/WinogradOptFunction.hpp" #ifdef MNN_USE_SSE extern "C" { void MNNInt8ToUInt8(void* ptr, int count); @@ -150,6 +152,33 @@ static void _im2colCommon(int8_t* colAddr, const int8_t* inputOrigin, const int8 } } } +void CPUConvInt8::ResourceInt8::updateInputOutputScale(float inputScale, float outputScale) { + if (inputScale == 0.f || outputScale == 0.f) { + return; + } + if (mInputScale == inputScale && mOutputScale == outputScale) { + return; + } + auto scalePtr = mScaleFloat->host(); + auto biasPtr = mBiasInt32->host(); + int size = mScaleFloat->elementSize(); + float is = mInputScale / inputScale; + float os = mOutputScale / outputScale; + for (int i = 0; i < size; i++) { + scalePtr[i] = scalePtr[i] * os / is; +#ifdef MNN_USE_SSE + if (offsets.empty()) { + biasPtr[i] = static_cast(biasPtr[i] * is); + } else { + biasPtr[i] = static_cast((biasPtr[i] - offsets[i]) * is + offsets[i]); + } +#else + biasPtr[i] = static_cast(biasPtr[i] * is); +#endif + } + mInputScale = inputScale; + mOutputScale = outputScale; +} CPUConvInt8::ResourceInt8::~ResourceInt8() { if(mWeightInt8 != nullptr) { backend->onReleaseBuffer(mWeightInt8.get(), Backend::STATIC); @@ -170,9 +199,12 @@ CPUConvInt8::CPUConvInt8(Backend* backend, const Convolution2DCommon* common, st : CPUConvolution(common, backend) { mResource = res; } -std::shared_ptr 
CPUConvInt8::makeResource(Backend* backend, const MNN::Convolution2D *convParam) { +std::shared_ptr CPUConvInt8::makeResource(Backend* backend, const MNN::Convolution2D *convParam, + float inputScale, float outputScale) { std::shared_ptr resource(new ResourceInt8); resource->backend = backend; + resource->mInputScale = inputScale; + resource->mOutputScale = outputScale; const auto convCommon = convParam->common(); const auto kx = convCommon->kernelX(); const auto ky = convCommon->kernelY(); @@ -198,28 +230,35 @@ std::shared_ptr CPUConvInt8::makeResource(Backend* ba #endif resource->mActBits = convParam->symmetricQuan()->nbits(); resource->mWeightInt8.reset(Tensor::createDevice({outputCountUnit, totalKernelCountD8Div2, GEMM_INT8_UNIT, GEMM_INT8_SRC_UNIT})); - auto weightSrc = convParam->symmetricQuan()->weight()->data(); auto allocRes = backend->onAcquireBuffer(resource->mWeightInt8.get(), Backend::STATIC); if (!allocRes) { return nullptr; } const int oneTileLen = resource->mWeightInt8->stride(1); const int outputChnnelStride = resource->mWeightInt8->stride(0); - std::shared_ptr quanCommon; - if (convParam->quanParameter() != nullptr) { - quanCommon = ConvolutionCommon::load(convParam->quanParameter(), false); - weightSrc = quanCommon->weight.get(); - } + const int outputChannleUp4 = ALIGN_UP4(outputCount); resource->mBiasInt32.reset(Tensor::createDevice({outputChannleUp4})); allocRes = backend->onAcquireBuffer(resource->mBiasInt32.get(), Backend::STATIC); if (!allocRes) { return nullptr; } + resource->mScaleFloat.reset(Tensor::createDevice({outputChannleUp4})); + allocRes = backend->onAcquireBuffer(resource->mScaleFloat.get(), Backend::STATIC); + if (!allocRes) { + return nullptr; + } auto biasPtr = resource->mBiasInt32->host(); memset(biasPtr, 0, outputChannleUp4 * sizeof(int32_t)); - memcpy(biasPtr, convParam->symmetricQuan()->bias()->data(), outputCount * sizeof(int32_t)); + auto scalePtr = resource->mScaleFloat->host(); + memset(scalePtr, 0, outputChannleUp4 * sizeof(float)); + const int8_t* weightSrc = nullptr; + std::shared_ptr quanCommon; + if (!ConvolutionCommon::getConvInt8Parameters(convParam, quanCommon, weightSrc, scalePtr, biasPtr, inputScale, outputScale)) { + return nullptr; + } #ifdef MNN_USE_SSE + resource->offsets.resize(outputCount); // For SSE use uint8_t, int8_t -> uint8_t, x + 128 -> x', x * w + b = (x' - 128) * w + b = x' * w + (-128 * w) + b for (int x = 0; x < outputCount; ++x) { const auto srcX = weightSrc + x * kernelCount * srcCount; @@ -227,6 +266,7 @@ std::shared_ptr CPUConvInt8::makeResource(Backend* ba for (int k = 0; k < kernelCount * srcCount; ++k) { offset += (int)srcX[k] * -128; } + resource->offsets[x] = offset; biasPtr[x] = biasPtr[x] + offset; } #endif @@ -253,16 +293,6 @@ std::shared_ptr CPUConvInt8::makeResource(Backend* ba } } } - resource->mScaleFloat.reset(Tensor::createDevice({outputChannleUp4})); - allocRes = backend->onAcquireBuffer(resource->mScaleFloat.get(), Backend::STATIC); - if (!allocRes) { - return nullptr; - } - - auto scalePtr = resource->mScaleFloat->host(); - memset(scalePtr, 0, outputChannleUp4 * sizeof(float)); - memcpy(scalePtr, convParam->symmetricQuan()->scale()->data(), outputCount * sizeof(float)); - resource->mInputZeroPoint = convParam->symmetricQuan()->zeroPoint(); resource->mOutputZeroPoint = convParam->symmetricQuan()->outputZeroPoint(); resource->mClampMin = convParam->symmetricQuan()->clampMin(); @@ -281,6 +311,7 @@ bool CPUConvInt8::onClone(Backend* bn, const Op* op, Execution** dst) { } ErrorCode 
CPUConvInt8::onResize(const std::vector& inputs, const std::vector& outputs) { + mResource->updateInputOutputScale(TensorUtils::getScale(inputs[0]), TensorUtils::getScale(outputs[0])); CPUConvolution::onResize(inputs, outputs); auto input = inputs[0]; auto output = outputs[0]; @@ -449,9 +480,15 @@ class CPUConvInt8Creator : public CPUBackend::Creator { public: virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* backend) const override { + float inputScale = 0.0f; + float outputScale = 0.0f; + if (inputs.size() > 0) { + inputScale = TensorUtils::getScale(inputs[0]); + outputScale = TensorUtils::getScale(outputs[0]); + } #if defined(__aarch64__) && defined(ENABLE_ARMV82) if(static_cast(backend)->supportDot()){ - return new CPUConvArm82Int8(backend, op->main_as_Convolution2D()); + return new CPUConvArm82Int8(backend, op->main_as_Convolution2D(), inputScale, outputScale); } #endif #ifdef MNN_USE_ONEDNN @@ -473,12 +510,12 @@ public: return new ConvInt83x3(backend, op->main_as_Convolution2D(), inputs, outputs); } } else if (((kx == 1 && ky != 1) || (kx != 1 && ky == 1)) && weightBits <= 7 && actBits <= 7) { - return new ConvInt8_1xN(backend, op->main_as_Convolution2D()); + return new ConvInt8_1xN(backend, op->main_as_Convolution2D(), inputScale, outputScale); } } } #endif - auto resource = CPUConvInt8::makeResource(backend, op->main_as_Convolution2D()); + auto resource = CPUConvInt8::makeResource(backend, op->main_as_Convolution2D(), inputScale, outputScale); if (nullptr == resource) { MNN_ERROR("Error for alloc memory when create CPUConvInt8\n"); return nullptr; diff --git a/source/backend/cpu/CPUConvInt8.hpp b/source/backend/cpu/CPUConvInt8.hpp index af2a2705..8cb8ecf2 100644 --- a/source/backend/cpu/CPUConvInt8.hpp +++ b/source/backend/cpu/CPUConvInt8.hpp @@ -31,21 +31,28 @@ public: int8_t mClampMin; int8_t mClampMax; Backend* backend; - + float mInputScale; + float mOutputScale; +#ifdef MNN_USE_SSE + std::vector offsets; +#endif + void updateInputOutputScale(float inputScale, float outputScale); ~ ResourceInt8(); }; CPUConvInt8(Backend *backend, const Convolution2DCommon* common, std::shared_ptr resource); virtual ~CPUConvInt8(); virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; - static std::shared_ptr makeResource(Backend *backend, const MNN::Convolution2D *convOp); + static std::shared_ptr makeResource(Backend *backend, const MNN::Convolution2D *convOp, + float inputScale, float outputScale); virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override; private: std::shared_ptr mResource; ConvolutionCommon::Im2ColParameter mIm2ColParamter; int mTileCount; int mThreadNums; - + float mInputScale; + float mOutputScale; Tensor mTempIm2ColBuffer; }; diff --git a/source/backend/cpu/CPUConvolution.cpp b/source/backend/cpu/CPUConvolution.cpp index 93cdb904..e6ac755a 100644 --- a/source/backend/cpu/CPUConvolution.cpp +++ b/source/backend/cpu/CPUConvolution.cpp @@ -18,8 +18,31 @@ namespace MNN { +bool CPUConvolution::Resource::copyBiasAlign(const float* bias, int outputCount) { + auto core = static_cast(backend)->functions(); + int bytes = core->bytes; + int unit = core->pack; + auto alignOutput = UP_DIV(outputCount, unit) * unit; + int remain = alignOutput - outputCount; + mBias.reset(Tensor::createDevice(std::vector{alignOutput * bytes})); + bool success = backend->onAcquireBuffer(mBias.get(), 
Backend::STATIC); + if (!success) { + MNN_ERROR("Error for alloc memory for Alloc Bias\n"); + return false;; + } + if (bytes < 4) { + core->MNNFp32ToLowp(bias, mBias->host(), outputCount); + } else { + ::memcpy(mBias->host(), bias, outputCount * bytes); + } + if (remain > 0) { + ::memset(mBias->host() + outputCount * bytes, 0, remain * bytes); + } + return true; +} + CPUConvolution::CPUConvolution(const Convolution2DCommon *convOp, Backend *b) : MNN::Execution(b), mCommon(convOp) { - mPostFunction = getPostFunction(); + // Do nothing } std::vector CPUConvolution::getPostParameters() const { std::vector postParameters = { @@ -68,6 +91,7 @@ void CPUConvolution::reorderWeightSlow(T* dest, const T* source, size_t depth, s } template void CPUConvolution::reorderWeightSlow(int8_t*, const int8_t*, size_t, size_t, size_t, size_t, size_t, bool); +template void CPUConvolution::reorderWeightSlow(int16_t*, const int16_t*, size_t, size_t, size_t, size_t, size_t, bool); // FLOAT16(__fp16) is not available here, so use int16_t (2 byte also) template // T -> U bool CPUConvolution::acquireMemoryAndCopy(std::shared_ptr dest, const T* source, size_t count, Backend* backend) { @@ -86,6 +110,7 @@ bool CPUConvolution::acquireMemoryAndCopy(std::shared_ptr dest, const T* template bool CPUConvolution::acquireMemoryAndCopy(std::shared_ptr, const int32_t*, size_t, Backend*); template bool CPUConvolution::acquireMemoryAndCopy(std::shared_ptr, const float*, size_t, Backend*); + ErrorCode CPUConvolution::onResize(const std::vector &inputs, const std::vector &outputs) { auto input = inputs[0]; auto output = outputs[0]; @@ -95,16 +120,6 @@ ErrorCode CPUConvolution::onResize(const std::vector &inputs, const st return NO_ERROR; } -CPUConvolution::POSTFUNCTION CPUConvolution::getPostFunction() const { - if (mCommon->relu()) { - return MNNAddBiasRelu; - } - if (mCommon->relu6()) { - return MNNAddBiasRelu6; - } - return MNNAddBias; -} - class ConvolutionFactory : public CPUBackend::Creator { public: virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, diff --git a/source/backend/cpu/CPUConvolution.hpp b/source/backend/cpu/CPUConvolution.hpp index 223d76f0..b7a6936c 100644 --- a/source/backend/cpu/CPUConvolution.hpp +++ b/source/backend/cpu/CPUConvolution.hpp @@ -18,6 +18,7 @@ public: std::shared_ptr mWeight; std::shared_ptr mBias; Backend* backend; + bool copyBiasAlign(const float* bias, int outputCount); ~ Resource() { if (nullptr != mBias) { backend->onReleaseBuffer(mBias.get(), Backend::STATIC); @@ -31,12 +32,12 @@ public: virtual ~CPUConvolution() = default; virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; - typedef void (*POSTFUNCTION)(float *dst, const float *bias, size_t planeNumber, size_t biasNumber); - - POSTFUNCTION getPostFunction() const; - static int reorderWeightSize(int depth, int outputCount, int kernelSize, int unitDepth, int unitOC); // Inefficient but need not cache, use it when speed insensitive (init, onResize) + // source shape: [outputCount, depth, kernelSize] + // dest shape: + // transpose=false: [UP_DIV(outputCount,unitOC), UP_DIV(depth,unitDepth), kernelSize, unitDepth, unitOC] + // transpose=true: [UP_DIV(outputCount,unitOC), UP_DIV(depth,unitDepth), kernelSize, unitOC, unitDepth] template static void reorderWeightSlow(T* dest, const T* source, size_t depth, size_t outputCount, size_t kernelSize, size_t unitDepth, size_t unitOC, bool transpose = false); /* Inefficient because of not use memcpy to support different type copy 
(T -> U), use it when speed insensitive (init, onResize) @@ -51,7 +52,6 @@ protected: // In execute, use pad from mPadX and mPadY, don't use mCommon's pad mutable int mPadX; mutable int mPadY; - CPUConvolution::POSTFUNCTION mPostFunction; }; } // namespace MNN diff --git a/source/backend/cpu/CPUConvolutionDepthwise.cpp b/source/backend/cpu/CPUConvolutionDepthwise.cpp index bd918b8f..affdd2cd 100644 --- a/source/backend/cpu/CPUConvolutionDepthwise.cpp +++ b/source/backend/cpu/CPUConvolutionDepthwise.cpp @@ -15,43 +15,6 @@ #include "backend/cpu/compute/CommonOptFunction.h" #include "backend/cpu/compute/ConvOpt.h" #include "backend/cpu/compute/ConvolutionDepthwise3x3.hpp" -static const int gIntUnit = 4; -extern "C" { -void MNNConvRunForLineDepthWiseInt8(float* dst, const int8_t* src, const int8_t* weight, size_t width, - size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, - const float* alpha_z); -} - -#ifndef MNN_USE_NEON -void MNNConvRunForLineDepthWiseInt8(float* dst, const int8_t* src, const int8_t* weight, size_t width, - size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, - const float* alpha_z) { - int dx, fx, fy; - for (dx = 0; dx < width; ++dx) { - float* dst_x = dst + dx * 4; - dst_x[0] = 0.0f; - dst_x[1] = 0.0f; - dst_x[2] = 0.0f; - dst_x[3] = 0.0f; - auto src_z = src + src_w_setup * dx; - auto weight_z = weight; - for (fy = 0; fy < fh; ++fy) { - auto src_y = src_z + fy * dilateY_step; - auto weight_y = weight_z + fy * fw * 4; - for (fx = 0; fx < fw; ++fx) { - auto weight_x = weight_y + 4 * fx; - auto src_x = src_y + fx * dilateX_step; - for (int j = 0; j < 4; ++j) { - dst_x[j] += (float)src_x[j] * (float)weight_x[j]; - } - } - } - for (int i = 0; i < 4; ++i) { - dst_x[i] *= alpha_z[i]; - } - } -} -#endif namespace MNN { CPUConvolutionDepthwise::FloatExecution::FloatExecution(const Convolution2DCommon* common, Backend* b, @@ -62,28 +25,40 @@ CPUConvolutionDepthwise::FloatExecution::FloatExecution(const Convolution2DCommo mOrigin.reset(new BasicFloatExecution(common, b)); mResource.reset(new Resource); mResource->backend = backend(); + auto core = static_cast(b)->functions(); + int bytes = core->bytes; + int unit = core->pack; int kw = layer->kernelX(); int kh = layer->kernelY(); int outputCount = (int)biasSize; - mResource->mBias.reset(Tensor::createDevice(std::vector{ALIGN_UP4(outputCount)})); - int depthQuad = UP_DIV(outputCount, 4); - int kernelSize = depthQuad * 4 * kw * kh; - mResource->mWeight.reset(Tensor::createDevice(std::vector{kernelSize})); - bool success = - b->onAcquireBuffer(mResource->mBias.get(), Backend::STATIC) && b->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC); + int depthQuad = UP_DIV(outputCount, unit); + int kernelSize = depthQuad * unit * kw * kh; + mResource->mWeight.reset(Tensor::createDevice(std::vector{kernelSize * bytes})); + bool success = b->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC); if (!success) { MNN_ERROR("Error for alloc memory for CPUConvolutionDepthwise\n"); mValid = false; return; } - ::memset(mResource->mBias->host(), 0, mResource->mBias->size()); - ::memcpy(mResource->mBias->host(), bias, biasSize * sizeof(float)); - + success = mResource->copyBiasAlign(bias, biasSize); + if (!success) { + mValid = false; + return; + } const float* tempWeight = originWeight; // Reorder weight from whc -> pwhc4 - ::memset(mResource->mWeight->host(), 0, kernelSize * sizeof(float)); auto weight = mResource->mWeight->host(); - MNNPackC4(weight, tempWeight, kh * kw, 
outputCount); + if (bytes < 4) { + AutoStorage tempW(kh * kw * outputCount * bytes); + if (tempW.get() == nullptr) { + mValid = false; + return; + } + core->MNNFp32ToLowp(tempWeight, (int16_t*)tempW.get(), kh * kw * outputCount); + core->MNNPackCUnit(weight, (const float*)tempW.get(), kh * kw, outputCount); + } else { + core->MNNPackCUnit(weight, tempWeight, kh * kw, outputCount); + } } CPUConvolutionDepthwise::FloatExecution::~FloatExecution() { // Do nothing @@ -102,12 +77,18 @@ ErrorCode CPUConvolutionDepthwise::MultiInputFloatExecution::onResize(const std: auto layer = mCommon; auto kw = layer->kernelX(); auto kh = layer->kernelY(); - - mWeight.reset(Tensor::createDevice({UP_DIV(inputs[0]->channel(), 4), kh, kw, 4})); - mBias.reset(Tensor::createDevice({ALIGN_UP4(inputs[0]->channel())})); + auto core = static_cast(backend())->functions(); + int bytes = core->bytes; + int unit = core->pack; + auto ic4 = UP_DIV(inputs[0]->channel(), unit); + mWeight.reset(Tensor::createDevice({ic4, kh, kw, unit * bytes})); + mBias.reset(Tensor::createDevice({ic4 * unit * bytes})); mTempInputs = {inputs[0], mWeight.get(), mBias.get()}; - backend()->onAcquireBuffer(mWeight.get(), Backend::DYNAMIC); - backend()->onAcquireBuffer(mBias.get(), Backend::DYNAMIC); + bool success = backend()->onAcquireBuffer(mWeight.get(), Backend::DYNAMIC); + success = success && backend()->onAcquireBuffer(mBias.get(), Backend::DYNAMIC); + if (!success) { + return OUT_OF_MEMORY; + } auto code = CPUConvolutionDepthwise::BasicFloatExecution::onResize(mTempInputs, outputs); backend()->onReleaseBuffer(mWeight.get(), Backend::DYNAMIC); backend()->onReleaseBuffer(mBias.get(), Backend::DYNAMIC); @@ -118,16 +99,18 @@ ErrorCode CPUConvolutionDepthwise::MultiInputFloatExecution::onExecute(const std const std::vector& outputs) { auto kh = mWeight->length(1); auto kw = mWeight->length(2); - ::memset(mBias->host(), 0, mBias->size()); - if (inputs.size() > 2) { - ::memcpy(mBias->host(), inputs[2]->host(), inputs[2]->size()); - } // Reorder weight from whc -> pwhc4 - ::memset(mWeight->host(), 0, mWeight->size()); auto outputCount = inputs[0]->channel(); auto weight = mWeight->host(); auto tempWeight = inputs[1]->host(); - MNNPackC4(weight, tempWeight, kh * kw, outputCount); + auto core = static_cast(backend())->functions(); + int bytes = core->bytes; + int unit = core->pack; + core->MNNPackCUnit(weight, tempWeight, kh * kw, outputCount); + ::memset(mBias->host(), 0, mBias->size()); + if (inputs.size() > 2) { + ::memcpy(mBias->host(), inputs[2]->host(), outputCount * bytes); + } return CPUConvolutionDepthwise::BasicFloatExecution::onExecute(mTempInputs, outputs); } @@ -135,28 +118,34 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onResize(const std::vect const std::vector& outputs) { CPUConvolution::onResize(inputs, outputs); auto layer = mCommon; + auto core = static_cast(backend())->functions(); + int bytes = core->bytes; + int unit = core->pack; + auto unitFunc = core->MNNConvRunForUnitDepthWise; + auto lineFunc = core->MNNConvRunForLineDepthwise; + auto postFunc = core->MNNAxByClampBroadcastUnit; auto inputTensor = inputs[0]; auto outputTensor = outputs[0]; int src_width = inputTensor->width(); int src_height = inputTensor->height(); int dst_width = outputTensor->width(); int dst_height = outputTensor->height(); - int dst_depth_quad = UP_DIV(layer->outputCount(), 4); - int dst_z_step = dst_width * dst_height * 4; - int src_z_step = src_width * src_height * 4; - int dst_y_step = dst_width * 4; - int src_y_step = src_width * 4; + 
int dst_depth_quad = UP_DIV(layer->outputCount(), unit); + int dst_z_step = dst_width * dst_height * unit; + int src_z_step = src_width * src_height * unit; + int dst_y_step = dst_width * unit; + int src_y_step = src_width * unit; int strideY = layer->strideY(); int strideX = layer->strideX(); int dilateX = layer->dilateX(); int dilateY = layer->dilateY(); - int dilateY_step = dilateY * src_width * 4; - int dilateX_step = dilateX * 4; + int dilateY_step = dilateY * src_width * unit; + int dilateX_step = dilateX * unit; int kernel_height = layer->kernelY(); int kernel_width = layer->kernelX(); int padX = mPadX; int padY = mPadY; - int weight_z_step = kernel_height * kernel_width * 4; + int weight_z_step = kernel_height * kernel_width * unit; // Compute Mid Rect int l = 0, t = 0, r = dst_width, b = dst_height; for (; l * strideX - padX < 0 && l < dst_width; l++) { @@ -172,46 +161,48 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onResize(const std::vect // do nothing } - auto postFunction = getPostFunction(); + auto postData = getPostParameters(); int numberThread = std::min(((CPUBackend*)backend())->threadNumber(), dst_depth_quad); - auto runBasic = [=](float* dst_z, const float* src_z, const float* weight_dz, int L, int T, int R, int B) { + auto runBasic = [=](uint8_t* dst_z, const uint8_t* src_z, const uint8_t* weight_dz, int L, int T, int R, int B) { for (int dy = T; dy < B; ++dy) { - float* dst_y = dst_z + dy * dst_y_step; + auto dst_y = dst_z + dy * dst_y_step * bytes; int srcStartY = dy * strideY - padY; - const float* src_dy = src_z + srcStartY * src_y_step; + const auto src_dy = src_z + srcStartY * src_y_step * bytes; int sfy = ALIMAX(0, (UP_DIV(-srcStartY, dilateY))); int efy = ALIMIN(kernel_height, UP_DIV(src_height - srcStartY, dilateY)); for (int dx = L; dx < R; ++dx) { - float* dst_x = dst_y + 4 * dx; + auto dst_x = dst_y + unit * dx * bytes; int srcStartX = dx * strideX - padX; - const float* src_dx = src_dy + srcStartX * 4; + const auto src_dx = src_dy + srcStartX * unit * bytes; int sfx = ALIMAX(0, (UP_DIV(-srcStartX, dilateX))); int efx = ALIMIN(kernel_width, UP_DIV(src_width - srcStartX, dilateX)); - MNNConvRunForUnitDepthWise(dst_x, src_dx + (sfx * dilateX + sfy * dilateY * src_width) * 4, - weight_dz + 4 * (kernel_width * sfy + sfx), efx - sfx, efy - sfy, - 4 * kernel_width, dilateX_step, dilateY_step); + unitFunc((float*)dst_x, (const float*)(src_dx + (sfx * dilateX + sfy * dilateY * src_width) * unit * bytes), + (const float*)(weight_dz + unit * (kernel_width * sfy + sfx) * bytes), efx - sfx, efy - sfy, + unit * kernel_width, dilateX_step, dilateY_step); } } }; - auto bias = inputs[2]; - auto weight = inputs[1]; - mExecutor = [=](const float* srcOrigin, float* dstOrigin, int tId) { - for (int dz = tId; dz < dst_depth_quad; dz += numberThread) { - float* dst_z = dstOrigin + dst_z_step * dz; - const float* src_z = srcOrigin + src_z_step * dz; - float* bias_z = bias->host() + 4 * dz; - const float* weight_dz = weight->host() + dz * weight_z_step; + auto biasP = inputs[2]->host(); + auto weightP = inputs[1]->host(); + int total = inputs[0]->batch() * dst_depth_quad; + mExecutor = [=](const uint8_t* srcOrigin, uint8_t* dstOrigin, int tId) { + for (int index = tId; index < total; index += numberThread) { + int dz = index % dst_depth_quad; + auto dst_z = dstOrigin + dst_z_step * index * bytes; + const auto src_z = srcOrigin + src_z_step * index * bytes; + auto bias_z = biasP + unit * dz * bytes; + const auto weight_dz = weightP + dz * weight_z_step * bytes; 
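+            // Padded borders (top, bottom, left and right) go through the
+            // point-by-point runBasic path; the interior rectangle [l,r) x [t,b)
+            // is processed a row at a time by lineFunc, and postFunc then adds the
+            // bias and applies the relu/relu6 clamp from getPostParameters().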
runBasic(dst_z, src_z, weight_dz, 0, 0, dst_width, t); runBasic(dst_z, src_z, weight_dz, 0, b, dst_width, dst_height); runBasic(dst_z, src_z, weight_dz, 0, t, l, b); runBasic(dst_z, src_z, weight_dz, r, t, dst_width, b); if (r > l && b > t) { - MNNConvRunForLineDepthwise(dst_z + t * dst_y_step + l * 4, - src_z + (t * strideY - padY) * src_y_step + (l * strideX - padX) * 4, - weight_dz, r - l, strideX * 4, kernel_width, kernel_height, dilateX_step, + lineFunc((float*)(dst_z + (t * dst_y_step + l * unit) * bytes), + (const float*)(src_z + ((t * strideY - padY) * src_y_step + (l * strideX - padX) * unit) * bytes), + (const float*)weight_dz, r - l, strideX * unit, kernel_width, kernel_height, dilateX_step, dilateY_step, b - t, src_y_step * strideY, dst_y_step); } - postFunction(dst_z, bias_z, dst_width * dst_height, 1); + postFunc((float*)dst_z, (float*)dst_z, (const float*)bias_z, dst_width * dst_height, 0, 0, 1, postData.data()); } }; mNumber = numberThread; @@ -223,185 +214,14 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onExecute(const std::vec const std::vector& outputs) { auto inputTensor = inputs[0]; auto outputTensor = outputs[0]; - for (int batchIndex = 0; batchIndex < inputTensor->batch(); ++batchIndex) { - const float* srcOrigin = inputTensor->host() + batchIndex * inputTensor->stride(0); - float* dstOrigin = outputTensor->host() + batchIndex * outputTensor->stride(0); - MNN_CONCURRENCY_BEGIN(tId, mNumber) { - mExecutor(srcOrigin, dstOrigin, (int)tId); - } - MNN_CONCURRENCY_END(); + const auto srcOrigin = inputTensor->host(); + auto dstOrigin = outputTensor->host(); + MNN_CONCURRENCY_BEGIN(tId, mNumber) { + mExecutor(srcOrigin, dstOrigin, (int)tId); } - + MNN_CONCURRENCY_END(); return NO_ERROR; } - -CPUConvolutionDepthwise::Int8Execution::Int8Execution(const Convolution2DCommon* convOp, Backend* b, - const ConvolutionCommon::Int8Common* common, - const float* bias, size_t biasSize) - : MNN::CPUConvolution(convOp, b) { - mQuan = common->quan; - MNN_ASSERT(nullptr != mQuan); - mBias.reset(ALIGN_UP4((int)biasSize)); - mBias.clear(); - ::memcpy(mBias.get(), bias, biasSize * sizeof(float)); - - mAlpha.reset(ALIGN_UP4((int)biasSize)); - mAlpha.clear(); - ::memcpy(mAlpha.get(), common->alpha.get(), biasSize * sizeof(float)); - - auto layer = mCommon; - int kx = layer->kernelX(); - int ky = layer->kernelY(); - - int outputCount = (int)biasSize; - int dstCountD8 = UP_DIV(outputCount, gIntUnit); - - int cur = 0; - mWeight.reset(dstCountD8 * gIntUnit * kx * ky); - mWeight.clear(); - int8_t* reorderedWeight = mWeight.get(); - auto originWeight = common->weight.get(); - for (int dz = 0; dz < outputCount; ++dz) { - int dzD8 = dz / gIntUnit; - int my = dz % gIntUnit; - auto dstDz = reorderedWeight + dzD8 * kx * ky * gIntUnit; - - for (int i = 0; i < kx * ky; ++i) { - auto index = i * gIntUnit; - dstDz[index + my] = originWeight[cur++]; - } - } -} - -ErrorCode CPUConvolutionDepthwise::Int8Execution::onResize(const std::vector& inputs, - const std::vector& outputs) { - auto result = CPUConvolution::onResize(inputs, outputs); - auto originInput = inputs[0]; - auto& ib = mInputTempBuffer.buffer(); - ib.type = halide_type_of(); - ib.dim[0].extent = UP_DIV(originInput->channel(), gIntUnit); - ib.dim[3].extent = gIntUnit; - ib.dim[1].extent = originInput->height(); - ib.dim[2].extent = originInput->width(); - TensorUtils::setLinearLayout(&mInputTempBuffer); - - backend()->onAcquireBuffer(&mInputTempBuffer, Backend::DYNAMIC); - backend()->onReleaseBuffer(&mInputTempBuffer, Backend::DYNAMIC); 
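A behavioural change worth calling out in BasicFloatExecution: onExecute no longer loops over batches outside the concurrency region. The executor flattens batch and channel tiles into one work index (total = batch * dst_depth_quad), so threads stay loaded even when the channel-tile count alone is smaller than the thread count. A rough sketch of that partitioning, with a hypothetical runOne callback standing in for the real depthwise kernel:

#include <functional>

// Split batch * depthQuad work items across numberThread workers.
void runBatched(int batch, int depthQuad, int numberThread, int tId,
                const std::function<void(int /*planeIndex*/, int /*dz*/)>& runOne) {
    int total = batch * depthQuad;
    for (int index = tId; index < total; index += numberThread) {
        int dz = index % depthQuad;  // channel tile: selects the weight/bias slice
        // 'index' addresses the input/output plane directly, since batch and
        // channel tiles are laid out contiguously in the packed buffer.
        runOne(index, dz);
    }
}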
- - auto layer = mCommon; - auto inputTensor = inputs[0]; - auto outputTensor = outputs[0]; - int src_width = inputTensor->width(); - int src_height = inputTensor->height(); - int dst_width = outputTensor->width(); - int dst_height = outputTensor->height(); - int dst_depth_quad = UP_DIV(layer->outputCount(), gIntUnit); - int dst_z_step = dst_width * dst_height * gIntUnit; - int src_z_step = mInputTempBuffer.buffer().dim[0].stride; - int dst_y_step = dst_width * gIntUnit; - int src_y_step = src_width * gIntUnit; - int strideY = layer->strideY(); - int strideX = layer->strideX(); - int dilateX = layer->dilateX(); - int dilateY = layer->dilateY(); - int dilateY_step = dilateY * src_width * gIntUnit; - int dilateX_step = dilateX * gIntUnit; - int kernel_height = layer->kernelY(); - int kernel_width = layer->kernelX(); - int padX = mPadX; - int padY = mPadY; - int weight_z_step = kernel_height * kernel_width * gIntUnit; - - // Compute Mid Rect - int l = 0, t = 0, r = dst_width, b = dst_height; - for (; l * strideX - padX < 0 && l < dst_width; l++) { - // do nothing - } - for (; t * strideY - padY < 0 && t < dst_height; t++) { - // do nothing - } - for (; (r - 1) * strideX - padX + (kernel_width - 1) * dilateX >= src_width && r > l; r--) { - // do nothing - } - for (; (b - 1) * strideY - padY + (kernel_height - 1) * dilateY >= src_height && b > t; b--) { - // do nothing - } - - auto postFunction = getPostFunction(); - for (int i=0; i<4; ++i) { - mQuanScale[i] = mQuan->quantScale(); - } - int8_t zeroPoint = 0; - - auto runBasic = [=](float* dst_z, const int8_t* src_z, const int8_t* weight_dz, const float* alpha_z, int L, int T, - int R, int B) { - for (int dy = T; dy < B; ++dy) { - float* dst_y = dst_z + dy * dst_y_step; - int srcStartY = dy * strideY - padY; - auto src_dy = src_z + srcStartY * src_y_step; - int sfy = ALIMAX(0, (UP_DIV(-srcStartY, dilateY))); - int efy = ALIMIN(kernel_height, UP_DIV(src_height - srcStartY, dilateY)); - for (int dx = L; dx < R; ++dx) { - float* dst_x = dst_y + 4 * dx; - int srcStartX = dx * strideX - padX; - auto src_dx = src_dy + srcStartX * 4; - int sfx = ALIMAX(0, (UP_DIV(-srcStartX, dilateX))); - int efx = ALIMIN(kernel_width, UP_DIV(src_width - srcStartX, dilateX)); - MNNConvRunForUnitDepthWiseInt8(dst_x, src_dx + (sfx * dilateX + sfy * dilateY * src_width) * 4, - weight_dz + 4 * (kernel_width * sfy + sfx), efx - sfx, efy - sfy, - 4 * kernel_width, dilateX_step, dilateY_step, alpha_z); - } - } - }; - auto aMin = mQuan->aMin(); - auto aMax = mQuan->aMax(); - mRun = [=]() { - for (int batchIndex = 0; batchIndex < inputTensor->batch(); ++batchIndex) { - const float* srcOrigin = inputTensor->host() + batchIndex * src_z_step * dst_depth_quad; - float* dstOrigin = outputTensor->host() + batchIndex * dst_z_step * dst_depth_quad; - - MNN_CONCURRENCY_BEGIN(dz, dst_depth_quad) { - float* dst_z_float = dstOrigin + dst_z_step * dz; - const float* src_z_float = srcOrigin + src_z_step * dz; - - auto dst_z = dst_z_float; - auto src_z = (int8_t*)mInputTempBuffer.buffer().host + dz * mInputTempBuffer.buffer().dim[0].stride; - - MNNFloat2Int8(src_z_float, src_z, src_z_step / 4, mQuanScale, aMin, aMax, zeroPoint); - - const float* bias_z = mBias.get() + gIntUnit * dz; - const float* alpha_z = mAlpha.get() + gIntUnit * dz; - const int8_t* weight_dz = mWeight.get() + dz * weight_z_step; - runBasic(dst_z, src_z, weight_dz, alpha_z, 0, 0, dst_width, t); - runBasic(dst_z, src_z, weight_dz, alpha_z, 0, b, dst_width, dst_height); - runBasic(dst_z, src_z, weight_dz, alpha_z, 0, t, l, 
b); - runBasic(dst_z, src_z, weight_dz, alpha_z, r, t, dst_width, b); - if (r > l) { - for (int dy = t; dy < b; ++dy) { - float* dst_y = dst_z + dy * dst_y_step; - int srcStartY = dy * strideY - padY; - auto src_dy = src_z + srcStartY * src_y_step; - MNNConvRunForLineDepthWiseInt8(dst_y + l * 4, src_dy + (l * strideX - padX) * 4, weight_dz, r - l, - strideX * 4, kernel_width, kernel_height, dilateX_step, dilateY_step, - alpha_z); - } - } - - postFunction(dst_z_float, bias_z, dst_width * dst_height, 1); - } - MNN_CONCURRENCY_END(); - } - }; - return result; -} - -ErrorCode CPUConvolutionDepthwise::Int8Execution::onExecute(const std::vector& inputs, - const std::vector& outputs) { - - mRun(); - return NO_ERROR; -} - class CPUConvolutionDepthwiseCreator : public CPUBackend::Creator { public: virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, @@ -415,10 +235,7 @@ public: size_t originWeightSize = 0; std::shared_ptr quanCommon; if (nullptr != conv2d->quanParameter()) { - quanCommon = ConvolutionCommon::load(conv2d->quanParameter(), false); - if (quanCommon->weightFloat.get() == nullptr) { - return new CPUConvolutionDepthwise::Int8Execution(conv2d->common(), backend, quanCommon.get(), conv2d->bias()->data(), conv2d->bias()->size()); - } + quanCommon = ConvolutionCommon::load(conv2d->quanParameter(), true); // Back to float originWeight = quanCommon->weightFloat.get(); originWeightSize = quanCommon->weightFloat.size(); diff --git a/source/backend/cpu/CPUConvolutionDepthwise.hpp b/source/backend/cpu/CPUConvolutionDepthwise.hpp index 9f5c70aa..9b7cbecb 100644 --- a/source/backend/cpu/CPUConvolutionDepthwise.hpp +++ b/source/backend/cpu/CPUConvolutionDepthwise.hpp @@ -25,7 +25,7 @@ public: virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; private: - std::function mExecutor; + std::function mExecutor; int mNumber = 1; }; class MultiInputFloatExecution : public BasicFloatExecution { @@ -64,25 +64,6 @@ public: std::vector mTempInputs; std::unique_ptr mOrigin; }; - - class Int8Execution : public CPUConvolution { - public: - Int8Execution(const Convolution2DCommon *convOp, Backend *b, const ConvolutionCommon::Int8Common *common, - const float *bias, size_t biasSize); - virtual ~Int8Execution() = default; - virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; - - private: - AutoStorage mWeight; - AutoStorage mBias; - AutoStorage mAlpha; - float mQuanScale[4]; - - Tensor mInputTempBuffer; - const IDSTQuan *mQuan; - std::function mRun; - }; }; } // namespace MNN diff --git a/source/backend/cpu/CPUDeconvolution.cpp b/source/backend/cpu/CPUDeconvolution.cpp index 07d81456..bbfb93f2 100644 --- a/source/backend/cpu/CPUDeconvolution.cpp +++ b/source/backend/cpu/CPUDeconvolution.cpp @@ -11,9 +11,9 @@ #include "CPUBackend.hpp" #include "core/Concurrency.h" #include "core/Macro.h" +#include "core/AutoStorage.h" #include "math/Matrix.hpp" #include "core/TensorUtils.hpp" -#include "math/Vec.hpp" #include "core/ConvolutionCommon.hpp" #include "compute/CommonOptFunction.h" #include "compute/ConvOpt.h" @@ -21,11 +21,11 @@ //#define MNN_OPEN_TIME_TRACE #include -using Vec4 = MNN::Math::Vec; namespace MNN { CPUDeconvolutionBasic::CPUDeconvolutionBasic(const Tensor* input, const Op* convOp, Backend* b) : CPUConvolution(convOp->main_as_Convolution2D()->common(), b) { mSrcCount = input->channel(); + mPostParameters = 
getPostParameters(); } ErrorCode CPUDeconvolutionBasic::onResize(const std::vector& inputs, const std::vector& outputs) { @@ -41,36 +41,42 @@ CPUDeconvolutionCommon::CPUDeconvolutionCommon(const Tensor* input, const Op* co : CPUDeconvolutionBasic(input, convOp, b) { auto conv2D = convOp->main_as_Convolution2D(); int outputCount = mCommon->outputCount(); - mBias.reset(Tensor::createDevice(std::vector{ALIGN_UP4(outputCount)})); + auto core = static_cast(b)->functions(); + mBias.reset(Tensor::createDevice(std::vector{UP_DIV(outputCount, core->pack) * core->pack})); bool success = b->onAcquireBuffer(mBias.get(), Backend::STATIC); if (!success) { mValid = false; return; } - ::memset(mBias->host(), 0, mBias->size()); - ::memcpy(mBias->host(), conv2D->bias()->data(), conv2D->bias()->size() * sizeof(float)); + ::memset(mBias->host(), 0, mBias->length(0) * core->bytes); + if (core->bytes == 4) { + ::memcpy(mBias->host(), conv2D->bias()->data(), conv2D->bias()->size() * sizeof(float)); + } else { + core->MNNFp32ToLowp(conv2D->bias()->data(), mBias->host(), conv2D->bias()->size()); + } } CPUDeconvolutionCommon::~CPUDeconvolutionCommon() { backend()->onReleaseBuffer(mBias.get(), Backend::STATIC); } -static void _transformWeight(const float* tempWeight, float* dest, int outputCount, int srcCount, int fh, int fw, - float* cache) { - auto outputC4 = UP_DIV(outputCount, 4); +static void _transformWeight(const uint8_t* tempWeight, uint8_t* dest, int outputCount, int srcCount, int fh, int fw, + uint8_t* cache, const CoreFunctions* core) { + auto outputC4 = UP_DIV(outputCount, core->pack); // c, n, h, w-> c, n/4 * 4, h, w for (int c=0; cpack * core->bytes; + auto src = tempWeight + c * outputCount * fw * fh * core->bytes; + core->MNNPackCUnit((float*)dst, (const float*)src, fw*fh, outputCount); } //printf("%d - %d - %d - %d\n", outputCount, srcCount, fh, fw); - MNNPackForMatMul_B(dest, cache, outputC4 * fw * fh * 4, srcCount, false); + core->MNNPackForMatMul_B((float*)dest, (const float*)cache, outputC4 * fw * fh * core->pack, srcCount, false); } CPUDeconvolution::CPUDeconvolution(const Tensor* input, const Op* convOp, Backend* backend) : MNN::CPUDeconvolutionCommon(input, convOp, backend) { auto layer = convOp->main_as_Convolution2D()->common(); + auto core = static_cast(backend)->functions(); const float* tempWeight = nullptr; int tempWeightSize = 0; @@ -81,9 +87,9 @@ CPUDeconvolution::CPUDeconvolution(const Tensor* input, const Op* convOp, Backen int fh = layer->kernelY(); int srcCount = mSrcCount; int eP, lP, hP; - MNNGetMatMulPackMode(&eP, &lP, &hP); - auto outputAlign = ALIGN_UP4(layer->outputCount()) * fw * fh; - mWeight.reset(Tensor::createDevice(std::vector{UP_DIV(outputAlign, hP), srcCount, hP})); + core->MNNGetMatMulPackMode(&eP, &lP, &hP); + auto outputAlign = UP_DIV(layer->outputCount(), core->pack) * core->pack * fw * fh; + mWeight.reset(Tensor::createDevice(std::vector{UP_DIV(outputAlign, hP), UP_DIV(srcCount, lP) * lP, hP})); std::shared_ptr cache(Tensor::createDevice({outputAlign * srcCount})); bool success = backend->onAcquireBuffer(mWeight.get(), Backend::STATIC) && backend->onAcquireBuffer(cache.get(), Backend::STATIC); @@ -91,10 +97,19 @@ CPUDeconvolution::CPUDeconvolution(const Tensor* input, const Op* convOp, Backen mValid = false; return; } - float* dest = mWeight->host(); - MNN_ASSERT(nullptr != dest); + auto dest = mWeight->host(); int outputCount = layer->outputCount(); - _transformWeight(tempWeight, dest, outputCount, srcCount, fh, fw, cache->host()); + AutoStorage lowpWeight; + 
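The deconvolution constructor follows the same recipe as the depthwise one: when the backend stores data in a narrower type (core->bytes < 4), the FP32 weights are first converted into a scratch buffer, and the packing routines then run on the converted data. A condensed sketch of that flow, with hypothetical convert/pack callbacks in place of MNNFp32ToLowp and _transformWeight; the real code uses AutoStorage and sets mValid = false when the staging buffer cannot be allocated:

#include <cstdint>
#include <vector>
#include <functional>

// Stage FP32 weights into the backend's element type before kernel-specific packing.
void prepareWeights(const float* fp32Weights, size_t count, int bytesPerElement,
                    const std::function<void(const float* src, void* dst, size_t n)>& convertToLowp,
                    const std::function<void(const void* typedWeights)>& packForKernel) {
    const void* src = fp32Weights;
    std::vector<uint8_t> scratch;
    if (bytesPerElement < 4) {
        scratch.resize(count * bytesPerElement);
        convertToLowp(fp32Weights, scratch.data(), count);  // e.g. FP32 -> FP16
        src = scratch.data();
    }
    packForKernel(src);  // per-channel C-unit packing, then the matmul-B layout
}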
if (core->bytes < 4) { + lowpWeight.reset(outputCount * srcCount * fh * fw * core->bytes); + if (lowpWeight.get() == nullptr) { + mValid = false; + return; + } + core->MNNFp32ToLowp(tempWeight, (int16_t*)lowpWeight.get(), outputCount * srcCount * fh * fw); + tempWeight = (float*)lowpWeight.get(); + } + _transformWeight((uint8_t*)tempWeight, dest, outputCount, srcCount, fh, fw, cache->host(), core); backend->onReleaseBuffer(cache.get(), Backend::STATIC); mOrigin.reset(new CPUDeconvolutionOrigin(input, convOp, backend)); } @@ -106,15 +121,16 @@ CPUDeconvolution::~CPUDeconvolution() { ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector& inputs, const std::vector& outputs) { CPUDeconvolutionBasic::onResize(inputs, outputs); + auto core = static_cast(backend())->functions(); auto input = inputs[0]; auto output = outputs[0]; auto oc = output->channel(); - if (ALIGN_UP4(oc) != inputs[2]->length(0)) { + if (UP_DIV(oc, core->pack) * core->pack != inputs[2]->length(0)) { return INPUT_DATA_ERROR; } - auto ocC4 = UP_DIV(output->channel(), 4); - auto icC4 = UP_DIV(input->channel(), 4); + auto ocC4 = UP_DIV(output->channel(), core->pack); + auto icC4 = UP_DIV(input->channel(), core->pack); auto kw = mCommon->kernelX(); auto kh = mCommon->kernelY(); auto dilateX = mCommon->dilateX(); @@ -133,7 +149,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector& inputs, c mPostFunctions.clear(); auto plane = width * height; const int maxDepth = 5; - std::shared_ptr tempColTotalBuffer(Tensor::createDevice({kernelCount, plane, 4})); + AutoRelease tempColTotalBuffer(Tensor::createDevice({kernelCount, plane, core->pack})); auto res = backend()->onAcquireBuffer(tempColTotalBuffer.get(), Backend::DYNAMIC); if (!res) { return OUT_OF_MEMORY; @@ -141,22 +157,22 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector& inputs, c auto colBufferPtr = tempColTotalBuffer->host(); auto biasPtr = inputs[2]->host(); auto inputPtr = input->host(); - std::shared_ptr tempInputBuffer( - Tensor::create({icC4, plane, 4}, inputPtr)); - std::shared_ptr tempInput(Tensor::createDevice({icC4, plane, 4})); + AutoRelease tempInputBuffer( + Tensor::create({icC4, plane, core->pack}, inputPtr)); + AutoRelease tempInput(Tensor::createDevice({icC4, plane, core->pack})); auto threadNumber = ((CPUBackend*)backend())->threadNumber(); if (input->batch() != 1) { res = backend()->onAcquireBuffer(tempInput.get(), Backend::DYNAMIC); if (!res) { return OUT_OF_MEMORY; } - auto newInputPtr = tempInput->host(); + auto newInputPtr = tempInput->host(); // Copy Batch - mPreFunctions.emplace_back(std::make_pair([newInputPtr, icC4, plane, threadNumber](const float* srcBatch, int tId) { + mPreFunctions.emplace_back(std::make_pair([newInputPtr, icC4, plane, threadNumber, core](const float* srcBatch, int tId) { for (int c = tId; cpack * core->bytes; + auto dstDepth = newInputPtr + c * plane * core->pack * core->bytes; + ::memcpy(dstDepth, srcDepth, plane * core->pack * core->bytes); } }, threadNumber)); } else { @@ -165,12 +181,13 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector& inputs, c mMatMul.reset(new StrassenMatrixComputor(backend(), true, maxDepth)); mMatMul->onEncode({tempInput.get(), inputs[1]}, {tempColTotalBuffer.get()}); mPostFunctions.emplace_back(std::make_pair([colBufferPtr, ocC4, width, height, kh, kw, padY, padX, dilateY, dilateX, strideY, - strideX, threadNumber, src_width, src_height, plane, biasPtr, this](float* outputPtr, int tId) { + strideX, threadNumber, src_width, src_height, plane, biasPtr, this, 
core](float* outputPtr, int tId) { + auto unitBytes = core->pack * core->bytes; for (int z = (tId); z < ocC4; z += threadNumber) { - auto dstZ = outputPtr + z * src_height * src_width * 4; - auto srcZ = colBufferPtr + kw * kh * 4 * plane * z; + auto dstZ = (uint8_t*)outputPtr + z * src_height * src_width * unitBytes; + auto srcZ = (uint8_t*)colBufferPtr + kw * kh * plane * z * unitBytes; auto dstB = dstZ; - ::memset(dstB, 0, 4 * src_width * src_height * sizeof(float)); + ::memset(dstB, 0, src_width * src_height * unitBytes); auto srcB = srcZ; for (int oy = 0; oy < height; ++oy) { for (int ox = 0; ox < width; ++ox) { @@ -183,21 +200,20 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector& inputs, c int sfx = ALIMAX(0, (UP_DIV(-srcStartX, dilateX))); int efx = ALIMIN(kw, UP_DIV(src_width - srcStartX, dilateX)); - auto dstStart = dstB + srcStartX * 4 + srcStartY * src_width * 4; - auto srcStart = srcB + 4 * (ox + oy * width); + auto dstStart = dstB + srcStartX * unitBytes + srcStartY * src_width * unitBytes; + auto srcStart = srcB + unitBytes * (ox + oy * width); + if (sfy >= efy || sfx >= efx) { + continue; + } for (int fy = sfy; fy < efy; ++fy) { - auto dstY = dstStart + fy * 4 * dilateY * src_width; - auto srcY = srcStart + fy * kw * plane * 4; - for (int fx = sfx; fx < efx; ++fx) { - auto dstX = dstY + fx * dilateX * 4; - auto srcX = srcY + fx * plane * 4; - Vec4::save(dstX, Vec4::load(dstX) + Vec4::load(srcX)); - } + auto dstY = dstStart + fy * unitBytes * dilateY * src_width; + auto srcY = srcStart + fy * kw * plane * unitBytes; + core->MNNAddC4WithStride((const float*)(srcY + sfx * plane * unitBytes), (float*)(dstY + sfx * dilateX * unitBytes), plane * core->pack, dilateX * core->pack, efx - sfx); } } } - mPostFunction(dstZ, biasPtr + 4 * z, src_height * src_width, 1); + core->MNNAxByClampBroadcastUnit((float*)dstZ, (float*)dstZ, (const float*)((uint8_t*)biasPtr + unitBytes * z), src_height * src_width, 0, 0, 1, mPostParameters.data()); } }, threadNumber)); if (tempInput->host() != inputPtr) { @@ -209,19 +225,29 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector& inputs, c ErrorCode CPUDeconvolutionOrigin::onExecute(const std::vector& inputs, const std::vector& outputs) { auto batch = inputs[0]->batch(); + auto core = static_cast(backend())->functions(); + auto input = inputs[0]; + auto output = outputs[0]; + auto oc = output->channel(); + auto ocC4 = UP_DIV(output->channel(), core->pack); + auto icC4 = UP_DIV(input->channel(), core->pack); + auto width = output->width(); + auto height = output->height(); + auto src_height = input->height(); + auto src_width = input->width(); for (int i=0; ihost() + i * inputs[0]->stride(0); - auto outputPtr = outputs[0]->host() + i * outputs[0]->stride(0); + auto inputPtr = inputs[0]->host() + i * src_width * src_height * icC4 * core->pack * core->bytes; + auto outputPtr = outputs[0]->host() + i * width * height * ocC4 * core->pack * core->bytes; for (auto& unit : mPreFunctions) { MNN_CONCURRENCY_BEGIN(tId, unit.second) { - unit.first(inputPtr, (int)tId); + unit.first((float*)inputPtr, (int)tId); } MNN_CONCURRENCY_END(); } mMatMul->onExecute(); for (auto& unit : mPostFunctions) { MNN_CONCURRENCY_BEGIN(tId, unit.second) { - unit.first(outputPtr, (int)tId); + unit.first((float*)outputPtr, (int)tId); } MNN_CONCURRENCY_END(); } @@ -234,9 +260,11 @@ public: const MNN::Op* op, Backend* backend) const { auto convOp = op->main_as_Convolution2D(); auto common = convOp->common(); - if (common->strideY() > 1 || common->strideX() > 1) { - 
if (common->dilateX() == 1 && common->dilateY() == 1) { - return new DeconvolutionWithStride(inputs[0], op, backend); + if (backend->type() == MNN_FORWARD_CPU) { + if (common->strideY() > 1 || common->strideX() > 1) { + if (common->dilateX() == 1 && common->dilateY() == 1) { + return new DeconvolutionWithStride(inputs[0], op, backend); + } } } return new CPUDeconvolution(inputs[0], op, backend); diff --git a/source/backend/cpu/CPUDeconvolution.hpp b/source/backend/cpu/CPUDeconvolution.hpp index 362ba798..1f253577 100644 --- a/source/backend/cpu/CPUDeconvolution.hpp +++ b/source/backend/cpu/CPUDeconvolution.hpp @@ -21,6 +21,7 @@ public: protected: int mSrcCount; + std::vector mPostParameters; }; class CPUDeconvolutionCommon : public CPUDeconvolutionBasic { diff --git a/source/backend/cpu/CPUDeconvolutionDepthwise.cpp b/source/backend/cpu/CPUDeconvolutionDepthwise.cpp index 44b951ea..ecbe0e97 100644 --- a/source/backend/cpu/CPUDeconvolutionDepthwise.cpp +++ b/source/backend/cpu/CPUDeconvolutionDepthwise.cpp @@ -6,12 +6,11 @@ // Copyright © 2018, Alibaba Group Holding Limited // -#include "backend/cpu/CPUDeconvolutionDepthwise.hpp" +#include "CPUDeconvolutionDepthwise.hpp" #include #include "backend/cpu/CPUBackend.hpp" -#include "MNN_generated.h" #include "core/Macro.h" -#include "backend/cpu/compute/ConvOpt.h" +#include "compute/CommonOptFunction.h" #include "core/Concurrency.h" @@ -23,35 +22,33 @@ CPUDeconvolutionDepthwise::CPUDeconvolutionDepthwise(const Tensor* input, const int kw = layer->kernelX(); int kh = layer->kernelY(); int outputCount = layer->outputCount(); - int depthQuad = UP_DIV(outputCount, 4); - int planeStride = kw * kh * 4; - + auto core = static_cast(backend())->functions(); + int depthQuad = UP_DIV(outputCount, core->pack); const float* tempWeight = nullptr; int tempWeightSize = 0; std::shared_ptr quanCommon; ConvolutionCommon::getConvParameters(&quanCommon, conv, &tempWeight, &tempWeightSize); // Reorder weight from whc -> pwhc4 - int kernelSize = depthQuad * 4 * kw * kh; + int kernelSize = depthQuad * core->pack * kw * kh; mWeight.reset(Tensor::createDevice(std::vector{kernelSize})); auto sucess = backend()->onAcquireBuffer(mWeight.get(), Backend::STATIC); if (!sucess) { mValid = false; return; } - ::memset(mWeight->host(), 0, mWeight->size()); - auto weight = mWeight->host(); - int cur = 0; - for (int c = 0; c < outputCount; ++c) { - int plane = c / 4; - int offset = c % 4; - for (int y = 0; y < kh; ++y) { - for (int x = 0; x < kw; ++x) { - float* dst = weight + offset + (x + y * kw) * 4 + planeStride * plane; - *dst = tempWeight[cur++]; - } + AutoStorage weightTempStorage; + if (core->bytes < 4) { + weightTempStorage.reset(kernelSize * core->bytes); + if (weightTempStorage.get() == nullptr) { + mValid = false; + return; } + core->MNNFp32ToLowp(tempWeight, (int16_t*)weightTempStorage.get(), kernelSize); + tempWeight = (const float*)weightTempStorage.get(); } + auto weight = mWeight->host(); + core->MNNPackCUnit(weight, tempWeight, kw * kh, outputCount); mOrigin.reset(new CPUDeconvolutionDepthwiseBasic(input, convOp, b)); } @@ -63,8 +60,9 @@ ErrorCode CPUDeconvolutionDepthwiseMultiInput::onResize(const std::vector& outputs) { auto kw = mCommon->kernelX(); auto kh = mCommon->kernelY(); - mWeight.reset(Tensor::createDevice({UP_DIV(inputs[0]->channel(), 4), kh, kw, 4})); - mBias.reset(Tensor::createDevice({UP_DIV(inputs[0]->channel(), 4), 4})); + auto core = static_cast(backend())->functions(); + mWeight.reset(Tensor::createDevice({UP_DIV(inputs[0]->channel(), 
core->pack), kh, kw, core->pack})); + mBias.reset(Tensor::createDevice({UP_DIV(inputs[0]->channel(), core->pack), core->pack})); backend()->onAcquireBuffer(mWeight.get(), Backend::DYNAMIC); backend()->onAcquireBuffer(mBias.get(), Backend::DYNAMIC); mInputs = {inputs[0], mWeight.get(), mBias.get()}; @@ -76,34 +74,25 @@ ErrorCode CPUDeconvolutionDepthwiseMultiInput::onResize(const std::vector& inputs, const std::vector& outputs) { - ::memset(mBias->host(), 0, mBias->size()); + auto core = static_cast(backend())->functions(); + ::memset(mBias->host(), 0, mBias->elementSize() * core->bytes); if (inputs.size() > 2) { - ::memcpy(mBias->host(), inputs[2]->host(), inputs[2]->size()); + ::memcpy(mBias->host(), inputs[2]->host(), inputs[2]->elementSize() * core->bytes); } - ::memset(mWeight->host(), 0, mWeight->size()); + ::memset(mWeight->host(), 0, mWeight->elementSize() * core->bytes); auto weight = mWeight->host(); auto outputCount = inputs[0]->channel(); auto kh = mWeight->length(1); auto kw = mWeight->length(2); auto tempWeight = inputs[1]->host(); - auto planeStride = kw * kh * 4; - int cur = 0; - for (int c = 0; c < outputCount; ++c) { - int plane = c / 4; - int offset = c % 4; - for (int y = 0; y < kh; ++y) { - for (int x = 0; x < kw; ++x) { - float* dst = weight + offset + (x + y * kw) * 4 + planeStride * plane; - *dst = tempWeight[cur++]; - } - } - } + core->MNNPackCUnit(weight, tempWeight, kw * kh, outputCount); return CPUDeconvolutionDepthwiseBasic::onExecute(mInputs, outputs); } ErrorCode CPUDeconvolutionDepthwiseBasic::onResize(const std::vector& inputs, const std::vector& outputs) { CPUDeconvolutionBasic::onResize(inputs, outputs); + auto core = static_cast(backend())->functions(); auto layer = mCommon; auto inputTensor = outputs[0]; auto outputTensor = inputs[0]; @@ -111,22 +100,22 @@ ErrorCode CPUDeconvolutionDepthwiseBasic::onResize(const std::vector& i int src_height = inputTensor->height(); int dst_width = outputTensor->width(); int dst_height = outputTensor->height(); - int dst_depth_quad = UP_DIV(layer->outputCount(), 4); - int dst_z_step = dst_width * dst_height * 4; - int src_z_step = src_width * src_height * 4; - int dst_y_step = dst_width * 4; - int src_y_step = src_width * 4; + int dst_depth_quad = UP_DIV(layer->outputCount(), core->pack); + int dst_z_step = dst_width * dst_height * core->pack; + int src_z_step = src_width * src_height * core->pack; + int dst_y_step = dst_width * core->pack; + int src_y_step = src_width * core->pack; int strideY = layer->strideY(); int strideX = layer->strideX(); int dilateX = layer->dilateX(); int dilateY = layer->dilateY(); - int dilateY_step = dilateY * src_width * 4; - int dilateX_step = dilateX * 4; + int dilateY_step = dilateY * src_width * core->pack; + int dilateX_step = dilateX * core->pack; int kernel_height = layer->kernelY(); int kernel_width = layer->kernelX(); int padX = mPadX; int padY = mPadY; - int weight_z_step = kernel_height * kernel_width * 4; + int weight_z_step = kernel_height * kernel_width * core->pack; // Compute Mid Rect int l = 0, t = 0, r = dst_width, b = dst_height; for (; l * strideX - padX < 0 && l < dst_width; l++) { @@ -142,23 +131,22 @@ ErrorCode CPUDeconvolutionDepthwiseBasic::onResize(const std::vector& i // do nothing } - auto postFunction = getPostFunction(); #define RUN_BASIC(L, T, R, B) \ for (int dy = T; dy < B; ++dy) { \ - const float* dst_y = dst_z + dy * dst_y_step; \ + auto dst_y = dst_z + dy * dst_y_step * core->bytes; \ int srcStartY = dy * strideY - padY; \ - float* src_dy = src_z + 
srcStartY * src_y_step; \ + auto src_dy = src_z + srcStartY * src_y_step * core->bytes; \ int sfy = ALIMAX(0, (UP_DIV(-srcStartY, dilateY))); \ int efy = ALIMIN(kernel_height, UP_DIV(src_height - srcStartY, dilateY)); \ for (int dx = L; dx < R; ++dx) { \ - const float* dst_x = dst_y + 4 * dx; \ + auto dst_x = dst_y + core->pack * core->bytes * dx; \ int srcStartX = dx * strideX - padX; \ - float* src_dx = src_dy + srcStartX * 4; \ + auto src_dx = src_dy + srcStartX * core->pack * core->bytes; \ int sfx = ALIMAX(0, (UP_DIV(-srcStartX, dilateX))); \ int efx = ALIMIN(kernel_width, UP_DIV(src_width - srcStartX, dilateX)); \ - MNNDeconvRunForUnitDepthWise(dst_x, src_dx + (sfx * dilateX + sfy * dilateY * src_width) * 4, \ - weight_dz + 4 * (kernel_width * sfy + sfx), efx - sfx, efy - sfy, \ - 4 * kernel_width, dilateX_step, dilateY_step); \ + core->MNNDeconvRunForUnitDepthWise((const float*)dst_x, (float*)(src_dx + (sfx * dilateX + sfy * dilateY * src_width) * core->bytes * core->pack), \ + (const float*)(weight_dz + core->pack * core->bytes * (kernel_width * sfy + sfx)), efx - sfx, efy - sfy, \ + core->pack * kernel_width, dilateX_step, dilateY_step); \ } \ } auto weight = inputs[1]; @@ -167,13 +155,13 @@ ErrorCode CPUDeconvolutionDepthwiseBasic::onResize(const std::vector& i int totalSize = batch * dst_depth_quad; int numberThread = ((CPUBackend*)backend())->threadNumber(); - mFunction = [=](const float* dstOrigin, float* srcOrigin, int tId) { + mFunction = [=](const uint8_t* dstOrigin, uint8_t* srcOrigin, int tId) { for (int dz = tId; dz < totalSize; dz+=numberThread) { auto zPos = dz % dst_depth_quad; - const float* dst_z = dstOrigin + dst_z_step * dz; - float* src_z = srcOrigin + src_z_step * dz; - const float* weight_dz = weight->host() + zPos * weight_z_step; - ::memset(src_z, 0, 4 * src_width * src_height * sizeof(float)); + auto dst_z = dstOrigin + dst_z_step * dz * core->bytes; + auto src_z = srcOrigin + src_z_step * dz * core->bytes; + auto weight_dz = weight->host() + zPos * weight_z_step * core->bytes; + ::memset(src_z, 0, src_width * src_height * core->bytes * core->pack); RUN_BASIC(0, 0, dst_width, t); RUN_BASIC(0, b, dst_width, dst_height); @@ -183,14 +171,14 @@ ErrorCode CPUDeconvolutionDepthwiseBasic::onResize(const std::vector& i if (r > l) { for (int dy = t; dy < b; ++dy) { - const float* dst_y = dst_z + dy * dst_y_step; + auto dst_y = dst_z + dy * dst_y_step * core->bytes; int srcStartY = dy * strideY - padY; - float* src_dy = src_z + srcStartY * src_y_step; - MNNDeconvRunForLineDepthwise(dst_y + l * 4, src_dy + (l * strideX - padX) * 4, weight_dz, r - l, - strideX * 4, kernel_width, kernel_height, dilateX_step, dilateY_step); + auto src_dy = src_z + srcStartY * src_y_step * core->bytes; + core->MNNDeconvRunForLineDepthwise((const float*)(dst_y + l * core->pack * core->bytes), (float*)(src_dy + (l * strideX - padX) * core->bytes * core->pack), (const float*)weight_dz, r - l, + strideX * core->pack, kernel_width, kernel_height, dilateX_step, dilateY_step); } } - postFunction(src_z, bias->host() + zPos * 4, src_width * src_height, 1); + core->MNNAxByClampBroadcastUnit((float*)src_z, (float*)src_z, (const float*)(bias->host() + zPos * core->pack * core->bytes), src_width * src_height, 0, 0, 1, mPostParameters.data()); } }; #undef RUN_BASIC @@ -204,8 +192,8 @@ ErrorCode CPUDeconvolutionDepthwiseBasic::onExecute(const std::vector& auto inputTensor = outputs[0]; auto outputTensor = inputs[0]; int numberThread = ((CPUBackend*)backend())->threadNumber(); - float* srcOrigin = 
inputTensor->host() + 0 * inputTensor->stride(0); - const float* dstOrigin = outputTensor->host() + 0 * outputTensor->stride(0); + auto srcOrigin = inputTensor->host(); + auto dstOrigin = outputTensor->host(); MNN_CONCURRENCY_BEGIN(tId, numberThread) { mFunction(dstOrigin, srcOrigin, tId); }; diff --git a/source/backend/cpu/CPUDeconvolutionDepthwise.hpp b/source/backend/cpu/CPUDeconvolutionDepthwise.hpp index d383aa33..65c42a5d 100644 --- a/source/backend/cpu/CPUDeconvolutionDepthwise.hpp +++ b/source/backend/cpu/CPUDeconvolutionDepthwise.hpp @@ -22,7 +22,7 @@ public: virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; private: - std::function mFunction; + std::function mFunction; }; class CPUDeconvolutionDepthwiseMultiInput : public CPUDeconvolutionDepthwiseBasic { diff --git a/source/backend/cpu/CPUDepthwiseConvInt8.cpp b/source/backend/cpu/CPUDepthwiseConvInt8.cpp index 27ccd495..9a088554 100644 --- a/source/backend/cpu/CPUDepthwiseConvInt8.cpp +++ b/source/backend/cpu/CPUDepthwiseConvInt8.cpp @@ -21,10 +21,12 @@ #define BASIC_TYPE int8_t #endif namespace MNN { -CPUDepthwiseConvInt8::CPUDepthwiseConvInt8(Backend* backend, const MNN::Convolution2D* dwConvParam) +CPUDepthwiseConvInt8::CPUDepthwiseConvInt8(Backend* backend, const MNN::Convolution2D* dwConvParam, float inputScale, float outputScale) : Execution(backend), mCommon(dwConvParam->common()) { auto common = dwConvParam->common(); mResource.reset(new CPUConvInt8::ResourceInt8); + mResource->mInputScale = inputScale; + mResource->mOutputScale = outputScale; mResource->mRelu = common->relu6() || common->relu(); mResource->backend = backend; const int kx = common->kernelX(); @@ -35,7 +37,6 @@ CPUDepthwiseConvInt8::CPUDepthwiseConvInt8(Backend* backend, const MNN::Convolut const int weightSizeAlign = ocDivUnit * UNIT * kernelSize; mResource->mWeightInt8.reset(Tensor::createDevice({weightSizeAlign})); - const auto *originWeight = dwConvParam->symmetricQuan()->weight()->data(); auto allocRes = backend->onAcquireBuffer(mResource->mWeightInt8.get(), Backend::STATIC); if (!allocRes) { mValid = false; @@ -43,10 +44,27 @@ CPUDepthwiseConvInt8::CPUDepthwiseConvInt8(Backend* backend, const MNN::Convolut } auto weightPtr = mResource->mWeightInt8->host(); memset(weightPtr, 0, weightSizeAlign * sizeof(BASIC_TYPE)); + mResource->mBiasInt32.reset(Tensor::createDevice({ocDivUnit * UNIT})); + allocRes = backend->onAcquireBuffer(mResource->mBiasInt32.get(), Backend::STATIC); + if (!allocRes) { + mValid = false; + return; + } + mResource->mScaleFloat.reset(Tensor::createDevice({ocDivUnit * UNIT})); + allocRes = backend->onAcquireBuffer(mResource->mScaleFloat.get(), Backend::STATIC); + if (!allocRes) { + mValid = false; + return; + } + auto biasPtr = mResource->mBiasInt32->host(); + auto scalePtr = mResource->mScaleFloat->host(); + memset(biasPtr, 0, ocDivUnit * UNIT * sizeof(int32_t)); + memset(scalePtr, 0, ocDivUnit * UNIT * sizeof(float)); + const int8_t* originWeight = nullptr; + std::shared_ptr quanCommon; - if (dwConvParam->quanParameter() != nullptr) { - quanCommon = ConvolutionCommon::load(dwConvParam->quanParameter(), false); - originWeight = quanCommon->weight.get(); + if (!ConvolutionCommon::getConvInt8Parameters(dwConvParam, quanCommon, originWeight, scalePtr, biasPtr, inputScale, outputScale)) { + return; } int cur = 0; for (int dz = 0; dz < outputCount; ++dz) { @@ -57,27 +75,6 @@ CPUDepthwiseConvInt8::CPUDepthwiseConvInt8(Backend* backend, const MNN::Convolut dstDz[i * UNIT + my] = 
originWeight[cur++]; } } - - mResource->mBiasInt32.reset(Tensor::createDevice({ocDivUnit * UNIT})); - allocRes = backend->onAcquireBuffer(mResource->mBiasInt32.get(), Backend::STATIC); - if (!allocRes) { - mValid = false; - return; - } - auto biasPtr = mResource->mBiasInt32->host(); - memset(biasPtr, 0, ocDivUnit * UNIT * sizeof(int32_t)); - memcpy(biasPtr, dwConvParam->symmetricQuan()->bias()->data(), outputCount * sizeof(int32_t)); - - mResource->mScaleFloat.reset(Tensor::createDevice({ocDivUnit * UNIT})); - allocRes = backend->onAcquireBuffer(mResource->mScaleFloat.get(), Backend::STATIC); - if (!allocRes) { - mValid = false; - return; - } - auto scalePtr = mResource->mScaleFloat->host(); - memset(scalePtr, 0, ocDivUnit * UNIT * sizeof(float)); - memcpy(scalePtr, dwConvParam->symmetricQuan()->scale()->data(), outputCount * sizeof(float)); - mResource->mInputZeroPoint = dwConvParam->symmetricQuan()->zeroPoint(); mResource->mOutputZeroPoint = dwConvParam->symmetricQuan()->outputZeroPoint(); mResource->mClampMin = dwConvParam->symmetricQuan()->clampMin(); @@ -100,6 +97,7 @@ bool CPUDepthwiseConvInt8::onClone(Backend* bn, const Op* op, Execution** dst) { ErrorCode CPUDepthwiseConvInt8::onResize(const std::vector& inputs, const std::vector& outputs) { auto input = inputs[0]; auto output = outputs[0]; + mResource->updateInputOutputScale(TensorUtils::getScale(input), TensorUtils::getScale(output)); auto pads = ConvolutionCommon::convolutionPadFull(input, output, mCommon); int padX = std::get<0>(pads); @@ -214,7 +212,13 @@ class CPUDepthwiseConvInt8Creator : public CPUBackend::Creator { public: virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* backend) const override { - return new CPUDepthwiseConvInt8(backend, op->main_as_Convolution2D()); + float inputScale = 0.0f; + float outputScale = 0.0f; + if (inputs.size() > 0) { + inputScale = TensorUtils::getScale(inputs[0]); + outputScale = TensorUtils::getScale(outputs[0]); + } + return new CPUDepthwiseConvInt8(backend, op->main_as_Convolution2D(), inputScale, outputScale); } }; diff --git a/source/backend/cpu/CPUDepthwiseConvInt8.hpp b/source/backend/cpu/CPUDepthwiseConvInt8.hpp index eba34928..11c27588 100644 --- a/source/backend/cpu/CPUDepthwiseConvInt8.hpp +++ b/source/backend/cpu/CPUDepthwiseConvInt8.hpp @@ -14,7 +14,7 @@ namespace MNN { class CPUDepthwiseConvInt8 : public Execution { public: - CPUDepthwiseConvInt8(Backend *backend, const MNN::Convolution2D *convOp); + CPUDepthwiseConvInt8(Backend *backend, const MNN::Convolution2D *convOp, float inputScale, float outputScale); virtual ~CPUDepthwiseConvInt8(); virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; diff --git a/source/backend/cpu/CPUEltwiseInt8.cpp b/source/backend/cpu/CPUEltwiseInt8.cpp index 95b699c8..1f51008b 100644 --- a/source/backend/cpu/CPUEltwiseInt8.cpp +++ b/source/backend/cpu/CPUEltwiseInt8.cpp @@ -10,6 +10,7 @@ #include "backend/cpu/CPUBackend.hpp" #include "core/Concurrency.h" #include "core/Macro.h" +#include "core/TensorUtils.hpp" extern "C" { void MNNScaleAddInt8(int8_t* dst, const int8_t* src0, const int8_t* src1, const float* scale0, const float* scale1, @@ -19,6 +20,10 @@ void MNNScaleAddInt8(int8_t* dst, const int8_t* src0, const int8_t* src1, const namespace MNN { CPUEltwiseInt8::CPUEltwiseInt8(Backend* backend, const Op* op) : Execution(backend) { + isEltwiseInt8 = 
op->type() == OpType_EltwiseInt8; + if (!isEltwiseInt8) { + return; + } auto param = op->main_as_EltwiseInt8(); auto copyData = [=](std::shared_ptr& tensor, const QuantizedFloatParam* scale) { const int size = scale->tensorScale()->size(); @@ -37,6 +42,9 @@ CPUEltwiseInt8::CPUEltwiseInt8(Backend* backend, const Op* op) : Execution(backe } CPUEltwiseInt8::~CPUEltwiseInt8() { + if (!isEltwiseInt8) { + return; + } backend()->onReleaseBuffer(mInput0Scales.get(), Backend::STATIC); backend()->onReleaseBuffer(mInput1Scales.get(), Backend::STATIC); backend()->onReleaseBuffer(mOutputScales.get(), Backend::STATIC); @@ -53,9 +61,20 @@ ErrorCode CPUEltwiseInt8::onExecute(const std::vector& inputs, const st const int height = input0->height(); const int oc4Stride = width * height; - const auto scale0Ptr = mInput0Scales->host(); - const auto scale1Ptr = mInput1Scales->host(); - const auto outputScalePtr = mOutputScales->host(); + const float *scale0Ptr, *scale1Ptr, *outputScalePtr; + std::vector scale0(input0->channel()), scale1(input1->channel()), outputScale(output->channel()); + if (isEltwiseInt8) { + scale0Ptr = mInput0Scales->host(); + scale1Ptr = mInput1Scales->host(); + outputScalePtr = mOutputScales->host(); + } else { + std::fill(scale0.begin(), scale0.end(), TensorUtils::getDescribe(input0)->quantAttr->scale); + std::fill(scale1.begin(), scale1.end(), TensorUtils::getDescribe(input1)->quantAttr->scale); + std::fill(outputScale.begin(), outputScale.end(), 1 / TensorUtils::getDescribe(output)->quantAttr->scale); + scale0Ptr = scale0.data(); + scale1Ptr = scale1.data(); + outputScalePtr = outputScale.data(); + } for (int bIndex = 0; bIndex < batch; ++bIndex) { const auto src0Batch = input0->host() + bIndex * batchStride; diff --git a/source/backend/cpu/CPUEltwiseInt8.hpp b/source/backend/cpu/CPUEltwiseInt8.hpp index 4dc8940f..dc829704 100644 --- a/source/backend/cpu/CPUEltwiseInt8.hpp +++ b/source/backend/cpu/CPUEltwiseInt8.hpp @@ -23,6 +23,7 @@ private: std::shared_ptr mInput0Scales; std::shared_ptr mInput1Scales; std::shared_ptr mOutputScales; + bool isEltwiseInt8 = true; }; } // namespace MNN diff --git a/source/backend/cpu/CPUGridSample.cpp b/source/backend/cpu/CPUGridSample.cpp new file mode 100644 index 00000000..e7f4aa77 --- /dev/null +++ b/source/backend/cpu/CPUGridSample.cpp @@ -0,0 +1,172 @@ +// +// CPUGridSample.cpp +// MNN +// +// Created by MNN on 2021/03/24. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "backend/cpu/CPUGridSample.hpp" +#include +#include +#include "core/Concurrency.h" +#include +#include "backend/cpu/CPUBackend.hpp" +#include "backend/cpu/compute/CommonOptFunction.h" +#include "backend/cpu/compute/ConvOpt.h" +#include "core/Macro.h" +#include +using Vec4 = MNN::Math::Vec; + +namespace MNN { +CPUGridSample::CPUGridSample(Backend *b, SampleMode mode, BorderMode paddingMode, bool alignCorners) + : Execution(b) { + mMode = mode; + mPaddingMode = paddingMode; + mAlignCorners = alignCorners; +} + +static float getPosition(float x, int range, bool alignCorners) { + float a = alignCorners ? 1.0f : 0.0f; + float b = alignCorners ? 
0.0f : 1.0f; + return ((1 + x) * (range - a) - b) / 2.0f; +} + +static int CLAMP(int v, int min, int max) { + if ((v) < min) { + (v) = min; + } else if ((v) > max) { + (v) = max; + } + return v; +} + +static Vec4 sample(int h, int w, const float *buffer, int height, int width, BorderMode padMode) { + if (h < 0 || h >= height || w < 0 || w >= width) { + if(padMode == BorderMode_ZEROS) { + return 0.0f; + } + // Clearly, CLAMP is the right way to go for GridSamplePaddingMode_BORDER + // For GridSamplePaddingMode_REFLECTION, since we have reflected the values into (-1, 1), + // the leftover reflections degrade to GridSamplePaddingMode_BORDER + h = CLAMP(h, 0, height - 1); + w = CLAMP(w, 0, width - 1); + } + + return Vec4::load(buffer + h * width * 4 + w * 4); +} + +static Vec4 interpolate(float h, float w, const float *buffer, int height, int width, SampleMode mode, BorderMode padMode) { + if (mode == SampleMode_NEAREST) { + int nh = ::floor(h+0.5f); + int nw = ::floor(w+0.5f); + return sample(nh, nw, buffer, height, width, padMode); + } + // mode == GridSampleMode_BILINEAR + int w0_h = ::floor(h); + int w0_w = ::floor(w); + int w1_h = ::ceil(h); + int w1_w = ::ceil(w); + auto oneV = Vec4(1.0f); + + Vec4 i00 = sample(w0_h, w0_w, buffer, height, width, padMode); + Vec4 i01 = sample(w0_h, w1_w, buffer, height, width, padMode); + Vec4 i10 = sample(w1_h, w0_w, buffer, height, width, padMode); + Vec4 i11 = sample(w1_h, w1_w, buffer, height, width, padMode); + auto f0 = Vec4((float)w1_w - w); + auto f1 = oneV - f0; + auto h0 = Vec4((float)w1_h - h); + auto h1 = oneV - h0; + + Vec4 i0 = i00 * f0 + i01 * f1; + Vec4 i1 = i10 * f0 + i11 * f1; + + return i0 * h0 + i1 * h1; +} + + +ErrorCode CPUGridSample::onResize(const std::vector &inputs, const std::vector &outputs) { + int numberThread = static_cast(backend())->threadNumber(); + auto outputTensor = outputs[0]; + auto outH = outputTensor->buffer().dim[2].extent; + auto outW = outputTensor->buffer().dim[3].extent; + mTempCordBuffer.reset(Tensor::createDevice({1, outH * outW * 2})); + auto res = backend()->onAcquireBuffer(mTempCordBuffer.get(), Backend::DYNAMIC); + if (!res) { + return OUT_OF_MEMORY; + } + backend()->onReleaseBuffer(mTempCordBuffer.get(), Backend::DYNAMIC); + return NO_ERROR; +} + +ErrorCode CPUGridSample::onExecute(const std::vector &inputs, const std::vector &outputs) { + auto inputTensor = inputs[0]; + auto gridTensor = inputs[1]; + auto outputTensor = outputs[0]; + + float *inputPtr = inputTensor->host(); + float *gridPtr = gridTensor->host(); + auto *outputPtr = outputTensor->host(); + + auto batches = inputTensor->buffer().dim[0].extent; + auto channels = inputTensor->buffer().dim[1].extent; + auto channelC4 = UP_DIV(channels, 4); + auto inH = inputTensor->buffer().dim[2].extent; + auto inW = inputTensor->buffer().dim[3].extent; + auto outH = outputTensor->buffer().dim[2].extent; + auto outW = outputTensor->buffer().dim[3].extent; + auto cordPtr = mTempCordBuffer->host(); + auto threadCount = static_cast(backend())->threadNumber(); + auto tileCount = channelC4 * outH; + for (auto b = 0; b < batches; ++b) { + const float *_inputPtr = inputPtr + b * inputTensor->buffer().dim[0].stride; + const float *_gridPtr = gridPtr + b * gridTensor->buffer().dim[0].stride; + float *_outputPtr = outputPtr + b * outputTensor->buffer().dim[0].stride; + // Compute cord + for (auto h = 0; h < outH; ++h) { + auto __gridPtr = _gridPtr + h * gridTensor->buffer().dim[1].stride; + auto cordH = cordPtr + h * outW * 2; + for (auto w = 0; w < outW; ++w) { + 
auto x = getPosition(__gridPtr[2 * w + 0], inW, mAlignCorners); + auto y = getPosition(__gridPtr[2 * w + 1], inH, mAlignCorners); + cordH[2 * w + 0] = x; + cordH[2 * w + 1] = y; + } + } + MNN_CONCURRENCY_BEGIN(tId, threadCount) { + for (int index=tId; index < tileCount; index += threadCount) { + auto c = index / outH; + auto h = index % outH; + auto inpC = _inputPtr + c * inW * inH * 4; + auto outC = _outputPtr + c * outW * outH * 4; + auto cordH = cordPtr + h * outW * 2; + auto outH = outC + h * outW * 4; + for (auto w = 0; w < outW; ++w) { + auto x = cordH[2 * w + 0]; + auto y = cordH[2 * w + 1]; + Vec4::save(outH + 4 * w, interpolate(y, x, inpC, inH, inW, mMode, mPaddingMode)); + } + } + } + MNN_CONCURRENCY_END(); + } + + return NO_ERROR; +} + +class CPUGridSampleCreator : public CPUBackend::Creator { +public: + virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, + const MNN::Op *op, Backend *backend) const { + auto gridSampleParam = op->main_as_GridSample(); + auto mode = gridSampleParam->mode(); + auto paddingMode = gridSampleParam->paddingMode(); + auto alignCorners = gridSampleParam->alignCorners(); + return new CPUGridSample(backend, mode, paddingMode, alignCorners); + } +}; + +REGISTER_CPU_OP_CREATOR(CPUGridSampleCreator, OpType_GridSample); + + +} // namespace MNN diff --git a/source/backend/cpu/CPUGridSample.hpp b/source/backend/cpu/CPUGridSample.hpp new file mode 100644 index 00000000..5ac66cb2 --- /dev/null +++ b/source/backend/cpu/CPUGridSample.hpp @@ -0,0 +1,32 @@ +// +// CPUGridSample.hpp +// MNN +// +// Created by MNN on 2021/03/24. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef CPUGridSample_hpp +#define CPUGridSample_hpp + +#include "core/Execution.hpp" +#include "MNN_generated.h" + +namespace MNN { +class CPUGridSample : public Execution { +public: + CPUGridSample(Backend *b, SampleMode mode, BorderMode paddingMode, bool alignCorners); + virtual ~CPUGridSample() = default; + virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; + virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; + +private: + SampleMode mMode; + BorderMode mPaddingMode; + bool mAlignCorners; + std::shared_ptr mTempCordBuffer; +}; + +} // namespace MNN + +#endif /* CPUGridSample_hpp */ diff --git a/source/backend/cpu/CPUMatMul.cpp b/source/backend/cpu/CPUMatMul.cpp index 710d793b..9eef5217 100644 --- a/source/backend/cpu/CPUMatMul.cpp +++ b/source/backend/cpu/CPUMatMul.cpp @@ -12,6 +12,7 @@ #include "compute/CommonOptFunction.h" #include "core/Macro.h" #include "core/Concurrency.h" +#include "core/AutoStorage.h" #include "math/Vec.hpp" #include @@ -22,58 +23,6 @@ CPUMatMul::CPUMatMul(Backend* backend, bool transposeA, bool transposeB, bool mu : Execution(backend), mTransposeA(transposeA), mTransposeB(transposeB), mSupportMultiThread(multiThread) { mComputer.reset(new StrassenMatrixComputor(backend, mSupportMultiThread, 5)); } -static void _TransposeUnpackC4MultiThread(float* BPtr, const float* BTempPtr, int tId, int hC4, int l, int h, int numberThread) { - for (int y = tId; y < hC4 - 1; y+=numberThread) { - auto src = y * 4 + BPtr; - auto dst = y * 4 * l + BTempPtr; - for (int x = 0; x< l ; ++x) { - auto srcX = src + x * h; - auto dstX = dst + 4 * x; - Vec4::save(srcX, Vec4::load(dstX)); - } - } - if (tId != numberThread - 1) { - return; - } - int lastY = 4 * (hC4 - 1); - int remain = h - lastY; - auto lastDst = BTempPtr + lastY * l; - auto lastSrc = lastY + BPtr; - for (int 
x=0; x(backend())->threadNumber() : 1; @@ -154,6 +103,7 @@ ErrorCode CPUMatMul::onResize(const std::vector& inputs, const std::vec } auto w0 = inputs[0]->length(1); auto h0 = inputs[0]->length(0); + auto core = static_cast(backend())->functions(); mComputer->onReset(); mPreFunctions.clear(); mPostFunctions.clear(); @@ -163,40 +113,40 @@ ErrorCode CPUMatMul::onResize(const std::vector& inputs, const std::vec if (mTransposeA) { l = h0; } - if (h == 1) { - const float* biasPtr = nullptr; - if (inputs.size() > 2) { - auto bias = inputs[2]; - biasPtr = bias->host(); + if (core->bytes == 4) { + if (h == 1) { + const float* biasPtr = nullptr; + if (inputs.size() > 2) { + auto bias = inputs[2]; + biasPtr = bias->host(); + } + _scheduleForVec(C->host(), biasPtr, e, l, h); + return NO_ERROR; } - _scheduleForVec(C->host(), biasPtr, e, l, h); - return NO_ERROR; - } - if (e == 1) { - const float* biasPtr = nullptr; - if (inputs.size() > 2) { - auto bias = inputs[2]; - biasPtr = bias->host(); + if (e == 1) { + const float* biasPtr = nullptr; + if (inputs.size() > 2) { + auto bias = inputs[2]; + biasPtr = bias->host(); + } + _scheduleForVecE(C->host(), biasPtr, e, l, h); + return NO_ERROR; } - _scheduleForVecE(C->host(), biasPtr, e, l, h); - return NO_ERROR; } int eP, lP, hP; - MNNGetMatMulPackMode(&eP, &lP, &hP); - std::shared_ptr AT(Tensor::createDevice({UP_DIV(l, 4), e, 4})); - std::shared_ptr BT(Tensor::createDevice({UP_DIV(h, hP), l, hP})); - std::shared_ptr CT(Tensor::createDevice({UP_DIV(h, 4), e, 4})); + core->MNNGetMatMulPackMode(&eP, &lP, &hP); + AutoRelease AT(Tensor::createDevice({UP_DIV(l, core->pack), e, core->pack})); + AutoRelease BT(Tensor::createDevice({UP_DIV(h, hP), UP_DIV(l, lP) * lP, hP})); + AutoRelease CT(Tensor::createDevice({UP_DIV(h, core->pack), e, core->pack})); auto res = backend()->onAcquireBuffer(BT.get(), Backend::DYNAMIC); if (!res) { return OUT_OF_MEMORY; } auto BTPtr = BT->host(); float* BTempPtr = BTPtr; - auto hC4 = UP_DIV(h, 4); - auto lC4 = UP_DIV(l, 4); int numberThread = mSupportMultiThread ? 
((CPUBackend*)backend())->threadNumber() : 1; - mPreFunctions.emplace_back(std::make_pair([BTempPtr, l, h, this] (int tId, const float* APtr, const float* BPtr) { - MNNPackForMatMul_B(BTempPtr, BPtr, h, l, mTransposeB); + mPreFunctions.emplace_back(std::make_pair([BTempPtr, l, h, this, core] (int tId, const float* APtr, const float* BPtr) { + core->MNNPackForMatMul_B(BTempPtr, BPtr, h, l, mTransposeB); } , 1)); res = backend()->onAcquireBuffer(AT.get(), Backend::DYNAMIC); res = res && backend()->onAcquireBuffer(CT.get(), Backend::DYNAMIC); @@ -206,25 +156,25 @@ ErrorCode CPUMatMul::onResize(const std::vector& inputs, const std::vec auto ATPtr = AT->host(); if (mTransposeA) { // l, e -> lC4, e, 4 - mPreFunctions.emplace_back(std::make_pair([ATPtr, e, l](int tId, const float* APtr, const float* BPtr) { - MNNPackC4(ATPtr, APtr, e, l); + mPreFunctions.emplace_back(std::make_pair([ATPtr, e, l, core](int tId, const float* APtr, const float* BPtr) { + core->MNNPackCUnit(ATPtr, APtr, e, l); }, 1)); } else { // e, l -> lC4, e, 4 mPreFunctions.emplace_back(std::make_pair( - [ATPtr, e, l, lC4, numberThread](int tId, const float* APtr, const float* BPtr) { - _TransposePackC4MultiThread(APtr, ATPtr, tId, lC4, e, l, numberThread); - }, numberThread)); + [ATPtr, e, l, core](int tId, const float* APtr, const float* BPtr) { + core->MNNPackCUnitTranspose(ATPtr, APtr, e, l); + }, 1)); } - std::shared_ptr biasWrap; + AutoRelease biasWrap; std::vector strassenInputs = {AT.get(), BT.get()}; std::vector postParameters; if (inputs.size() > 2) { auto bias = inputs[2]; auto biasLength = bias->elementSize(); - if (biasLength % 4 != 0) { + if (biasLength % core->pack != 0) { // Padding to align of 4 - biasWrap.reset(Tensor::createDevice({UP_DIV(biasLength, 4) * 4})); + biasWrap.reset(Tensor::createDevice({UP_DIV(biasLength, core->pack) * core->pack})); res = backend()->onAcquireBuffer(biasWrap.get(), Backend::DYNAMIC); if (!res) { return OUT_OF_MEMORY; @@ -232,9 +182,9 @@ ErrorCode CPUMatMul::onResize(const std::vector& inputs, const std::vec auto borigin = bias->host(); auto bdest = biasWrap->host(); mPreFunctions.emplace_back(std::make_pair( - [borigin, biasLength, bdest](int tId, const float* APtr, const float* BPtr) { - ::memset(bdest, 0, UP_DIV(biasLength, 4) * 4 * sizeof(float)); - ::memcpy(bdest, borigin, biasLength * sizeof(float)); + [borigin, biasLength, bdest, core](int tId, const float* APtr, const float* BPtr) { + ::memset(bdest, 0, UP_DIV(biasLength, core->pack) * core->bytes * core->pack); + ::memcpy(bdest, borigin, biasLength * core->bytes); }, 1)); strassenInputs.emplace_back(biasWrap.get()); } else { @@ -251,13 +201,16 @@ ErrorCode CPUMatMul::onResize(const std::vector& inputs, const std::vec if (NO_ERROR != code) { return code; } - auto CTPtr = CT->host(); + if (nullptr != biasWrap.get()) { + backend()->onReleaseBuffer(biasWrap.get(), Backend::DYNAMIC); + } + auto CTPtr = CT->host(); // hC4, e, 4 -> e, h - mPostFunctions.emplace_back(std::make_pair([CTPtr, e, h, hC4, numberThread]( + mPostFunctions.emplace_back(std::make_pair([CTPtr, e, h, core]( int tId, const float* APtr, const float* BPtr, float* CPtr) { - _TransposeUnpackC4MultiThread(CPtr, CTPtr, tId, hC4, e, h, numberThread); - }, numberThread)); + core->MNNUnpackCUnitTranspose(CPtr, CTPtr, e, h); + }, 1)); backend()->onReleaseBuffer(AT.get(), Backend::DYNAMIC); backend()->onReleaseBuffer(BT.get(), Backend::DYNAMIC); backend()->onReleaseBuffer(CT.get(), Backend::DYNAMIC); @@ -268,7 +221,8 @@ ErrorCode CPUMatMul::onExecute(const std::vector& 
inputs, const std::ve // Fill output by zero if one of inputs is empty. if (inputs.size() == 2 && outputs.size() == 1 && (inputs[0]->elementSize() == 0 || inputs[1]->elementSize() == 0)) { - ::memset(outputs[0]->host(), 0, outputs[0]->size()); + auto core = static_cast(backend())->functions(); + ::memset(outputs[0]->host(), 0, outputs[0]->elementSize() * core->bytes); return NO_ERROR; } @@ -292,11 +246,108 @@ ErrorCode CPUMatMul::onExecute(const std::vector& inputs, const std::ve return NO_ERROR; } + + +class CPUMultiMatMul : public Execution { +public: + CPUMultiMatMul(Backend *backend, bool transposeA, bool tranposeB) : Execution(backend) { + mMatMul.reset(new CPUMatMul(backend, transposeA, tranposeB, true)); + } + virtual ~CPUMultiMatMul() = default; + virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override { + auto input0 = inputs[0]; + auto input1 = inputs[1]; + auto output = outputs[0]; + auto core = static_cast(backend())->functions(); + auto i0Dim = input0->dimensions(); + auto i1Dim = input1->dimensions(); + auto o0Dim = output->dimensions(); + const int input0Stride = input0->length(i0Dim - 1) * input0->length(i0Dim - 2); + const int input1Stride = input1->length(i1Dim - 1) * input1->length(i1Dim - 2); + const int outputStride = output->length(o0Dim - 1) * output->length(o0Dim - 2); + // Compute BroastCast Dims + auto dimOffset = o0Dim - 2; + const int maxDimensions = dimOffset; + std::vector outputStrides(maxDimensions); + std::vector input0Strides(maxDimensions, 0); + std::vector input1Strides(maxDimensions, 0); + auto i0Offset = output->dimensions() - input0->dimensions(); + auto i1Offset = output->dimensions() - input1->dimensions(); + int totalSize = 1; + int i0Size = 1; + int i1Size = 1; + for (int i = maxDimensions - 1; i >=0 ; --i) { + outputStrides[i] = totalSize; + totalSize *= output->length(i); + if (i >= i0Offset && input0->length(i - i0Offset) > 1) { + input0Strides[i] = i0Size; + i0Size *= input0->length(i - i0Offset); + } + if (i >= i1Offset && input1->length(i - i1Offset) > 1) { + input1Strides[i] = i1Size; + i1Size *= input1->length(i - i1Offset); + } + } + auto input0Ptr = input0->host(); + auto input1Ptr = input1->host(); + auto outputPtr = output->host(); + for (int index = 0; index < totalSize; ++index) { + // Unrool the cords + auto c = index; + i0Offset = 0; + i1Offset = 0; + for (int i=0; ihost(), input0Ptr + i0Offset * input0Stride * core->bytes, input0Stride * core->bytes); + ::memcpy(mMatrixB->host(), input1Ptr + i1Offset * input1Stride * core->bytes, input1Stride * core->bytes); + mMatMul->onExecute(mTempInputs, mTempOutputs); + ::memcpy(outputPtr + index * outputStride * core->bytes, mMatrixC->host(), outputStride * core->bytes); + } + return NO_ERROR; + } + virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override { + auto input0 = inputs[0]; + auto input1 = inputs[1]; + auto output = outputs[0]; + mMatrixA.reset(Tensor::createDevice({input0->length(input0->dimensions()-2), input0->length(input0->dimensions()-1)})); + mMatrixB.reset(Tensor::createDevice({input1->length(input1->dimensions()-2), input1->length(input1->dimensions()-1)})); + mMatrixC.reset(Tensor::createDevice({output->length(output->dimensions()-2), output->length(output->dimensions()-1)})); + mTempInputs = {mMatrixA.get(), mMatrixB.get()}; + mTempOutputs = {mMatrixC.get()}; + auto res = backend()->onAcquireBuffer(mMatrixA.get(), Backend::DYNAMIC); + res = res && backend()->onAcquireBuffer(mMatrixB.get(), 
Backend::DYNAMIC); + res = res && backend()->onAcquireBuffer(mMatrixC.get(), Backend::DYNAMIC); + + if (!res) { + return OUT_OF_MEMORY; + } + auto code = mMatMul->onResize(mTempInputs, mTempOutputs); + backend()->onReleaseBuffer(mMatrixA.get(), Backend::DYNAMIC); + backend()->onReleaseBuffer(mMatrixB.get(), Backend::DYNAMIC); + backend()->onReleaseBuffer(mMatrixC.get(), Backend::DYNAMIC); + return code; + } +private: + std::shared_ptr mMatMul; + std::vector mTempInputs; + std::vector mTempOutputs; + std::shared_ptr mMatrixA; + std::shared_ptr mMatrixB; + std::shared_ptr mMatrixC; +}; + class CPUMatMulCreator : public CPUBackend::Creator { public: virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* backend) const override { auto param = op->main_as_MatMul(); + if (outputs[0]->dimensions() > 2) { + return new CPUMultiMatMul(backend, param->transposeA(), param->transposeB()); + } return new CPUMatMul(backend, param->transposeA(), param->transposeB(), true); } }; diff --git a/source/backend/cpu/CPUOPRegister.cpp b/source/backend/cpu/CPUOPRegister.cpp index 53250d3d..ce057edf 100644 --- a/source/backend/cpu/CPUOPRegister.cpp +++ b/source/backend/cpu/CPUOPRegister.cpp @@ -9,21 +9,18 @@ extern void ___CPUSelectCreator__OpType_Select__(); extern void ___CPUSoftmaxCreator__OpType_Softmax__(); extern void ___CPUDetectionPostProcessCreator__OpType_DetectionPostProcess__(); extern void ___CPUCastCreator__OpType_Cast__(); -extern void ___CPUSoftmaxGradCreator__OpType_SoftmaxGrad__(); extern void ___CPUProposalCreator__OpType_Proposal__(); extern void ___CPUInterpCreator__OpType_Interp__(); +extern void ___CPUGridSampleCreator__OpType_GridSample__(); extern void ___CPUConstCreator__OpType_Const__(); extern void ___CPUConstCreator__OpType_TrainableParam__(); extern void ___CPUDetectionOutputCreator__OpType_DetectionOutput__(); -extern void ___CPUSizeCreator__OpType_Size__(); extern void ___CPUUnravelIndexCreator__OpType_UnravelIndex__(); extern void ___CPUMatMulCreator__OpType_MatMul__(); extern void ___CPUMomentsCreator__OpType_Moments__(); extern void ___CPUInstanceNormCreator__OpType_InstanceNorm__(); extern void ___CPUQuantizedLogisticCreator__OpType_QuantizedLogistic__(); extern void ___CPUWhereCreator__OpType_Where__(); -extern void ___CPUReluGradCreator__OpType_ReluGrad__(); -extern void ___CPUReluGradCreator__OpType_Relu6Grad__(); extern void ___CPUQuantizedMaxPoolCreator__OpType_QuantizedMaxPool__(); extern void ___CPUDeconvolutionCreator__OpType_Deconvolution__(); extern void ___CPUBinaryCreator__OpType_BinaryOp__(); @@ -31,13 +28,11 @@ extern void ___CPUDepthwiseCreator__OpType_QuantizedDepthwiseConv2D__(); extern void ___CPUQuantizedSoftmaxCreator__OpType_QuantizedSoftmax__(); extern void ___CPUPoolCreator__OpType_Pooling__(); extern void ___CPUScatterNdCreator__OpType_ScatterNd__(); -extern void ___CPUShapeCreator__OpType_Shape__(); extern void ___CPUPluginCreator__OpType_Plugin__(); extern void ___CPUInt8ToFloatCreator__OpType_Int8ToFloat__(); extern void ___CPUROIPoolingCreator__OpType_ROIPooling__(); extern void ___CPUTopKV2Creator__OpType_TopKV2__(); extern void ___CPUUnaryCreator__OpType_UnaryOp__(); -extern void ___CPUSigmoidCreator__OpType_Sigmoid__(); extern void ___CPUReductionCreator__OpType_Reduction__(); extern void ___CPUGatherNDCreator__OpType_GatherND__(); extern void ___CPUReluCreator__OpType_ReLU__(); @@ -50,7 +45,6 @@ extern void ___CPUMatrixBandPartCreator__OpType_MatrixBandPart__(); extern void 
___CPUQuantizedAddCreator__OpType_QuantizedAdd__(); extern void ___CPUDeconvolutionDepthwiseCreator__OpType_DeconvolutionDepthwise__(); extern void ___CPUFloatToInt8Creator__OpType_FloatToInt8__(); -extern void ___CPURankCreator__OpType_Rank__(); extern void ___CPULinSpaceCreator__OpType_LinSpace__(); extern void ___CPUNonMaxSuppressionV2Creator__OpType_NonMaxSuppressionV2__(); extern void ___CPUGatherV2Creator__OpType_GatherV2__(); @@ -68,7 +62,6 @@ extern void ___CPUAsStringCreator__OpType_AsString__(); extern void ___CPURandomUniformCreator__OpType_RandomUniform__(); extern void ___CPUSetDiff1DCreator__OpType_SetDiff1D__(); extern void ___CPUReduceJoinCreator__OpType_ReduceJoin__(); -extern void ___CPUPriorBoxCreator__OpType_PriorBox__(); extern void ___CPUEltwiseInt8Creator__OpType_EltwiseInt8__(); extern void ___CPUBatchMatMulCreator__OpType_BatchMatMul__(); extern void ___CPULayerNormCreator__OpType_LayerNorm__(); @@ -83,21 +76,18 @@ ___CPUSelectCreator__OpType_Select__(); ___CPUSoftmaxCreator__OpType_Softmax__(); ___CPUDetectionPostProcessCreator__OpType_DetectionPostProcess__(); ___CPUCastCreator__OpType_Cast__(); -___CPUSoftmaxGradCreator__OpType_SoftmaxGrad__(); ___CPUProposalCreator__OpType_Proposal__(); ___CPUInterpCreator__OpType_Interp__(); +___CPUGridSampleCreator__OpType_GridSample__(); ___CPUConstCreator__OpType_Const__(); ___CPUConstCreator__OpType_TrainableParam__(); ___CPUDetectionOutputCreator__OpType_DetectionOutput__(); -___CPUSizeCreator__OpType_Size__(); ___CPUUnravelIndexCreator__OpType_UnravelIndex__(); ___CPUMatMulCreator__OpType_MatMul__(); ___CPUMomentsCreator__OpType_Moments__(); ___CPUInstanceNormCreator__OpType_InstanceNorm__(); ___CPUQuantizedLogisticCreator__OpType_QuantizedLogistic__(); ___CPUWhereCreator__OpType_Where__(); -___CPUReluGradCreator__OpType_ReluGrad__(); -___CPUReluGradCreator__OpType_Relu6Grad__(); ___CPUQuantizedMaxPoolCreator__OpType_QuantizedMaxPool__(); ___CPUDeconvolutionCreator__OpType_Deconvolution__(); ___CPUBinaryCreator__OpType_BinaryOp__(); @@ -105,13 +95,11 @@ ___CPUDepthwiseCreator__OpType_QuantizedDepthwiseConv2D__(); ___CPUQuantizedSoftmaxCreator__OpType_QuantizedSoftmax__(); ___CPUPoolCreator__OpType_Pooling__(); ___CPUScatterNdCreator__OpType_ScatterNd__(); -___CPUShapeCreator__OpType_Shape__(); ___CPUPluginCreator__OpType_Plugin__(); ___CPUInt8ToFloatCreator__OpType_Int8ToFloat__(); ___CPUROIPoolingCreator__OpType_ROIPooling__(); ___CPUTopKV2Creator__OpType_TopKV2__(); ___CPUUnaryCreator__OpType_UnaryOp__(); -___CPUSigmoidCreator__OpType_Sigmoid__(); ___CPUReductionCreator__OpType_Reduction__(); ___CPUGatherNDCreator__OpType_GatherND__(); ___CPUReluCreator__OpType_ReLU__(); @@ -124,7 +112,6 @@ ___CPUMatrixBandPartCreator__OpType_MatrixBandPart__(); ___CPUQuantizedAddCreator__OpType_QuantizedAdd__(); ___CPUDeconvolutionDepthwiseCreator__OpType_DeconvolutionDepthwise__(); ___CPUFloatToInt8Creator__OpType_FloatToInt8__(); -___CPURankCreator__OpType_Rank__(); ___CPULinSpaceCreator__OpType_LinSpace__(); ___CPUNonMaxSuppressionV2Creator__OpType_NonMaxSuppressionV2__(); ___CPUGatherV2Creator__OpType_GatherV2__(); @@ -142,7 +129,6 @@ ___CPUAsStringCreator__OpType_AsString__(); ___CPURandomUniformCreator__OpType_RandomUniform__(); ___CPUSetDiff1DCreator__OpType_SetDiff1D__(); ___CPUReduceJoinCreator__OpType_ReduceJoin__(); -___CPUPriorBoxCreator__OpType_PriorBox__(); ___CPUEltwiseInt8Creator__OpType_EltwiseInt8__(); ___CPUBatchMatMulCreator__OpType_BatchMatMul__(); ___CPULayerNormCreator__OpType_LayerNorm__(); diff --git 
a/source/backend/cpu/CPUPool.cpp b/source/backend/cpu/CPUPool.cpp index 1835ed73..05b65e47 100644 --- a/source/backend/cpu/CPUPool.cpp +++ b/source/backend/cpu/CPUPool.cpp @@ -7,368 +7,22 @@ // #include "backend/cpu/CPUPool.hpp" -#include -#include -#include "core/Macro.h" - -#include "core/Concurrency.h" #include "math/Vec.hpp" using Vec4 = MNN::Math::Vec; - -static void pooling_max_pad(const float *channelInput, float *offsetOutput, int inputWidth, int inputHeight, - int inputStep4, int inputSize4, int kernelWidth, int kernelHeight, int iw, int ih) { - Vec4 max = Vec4(-FLT_MAX); - - const float *bottomLine = channelInput + inputSize4 - inputStep4; - for (int kh = 0; kh < kernelHeight; kh++) { - const int h = ih + kh; - const float *paddedLineInput = nullptr; - if (h < 0) { // top replicate - paddedLineInput = channelInput; - } else if (h >= inputHeight) { // bottom replicate - paddedLineInput = bottomLine; - } else { - paddedLineInput = channelInput + h * inputStep4; - } - - const float *rightEdge = paddedLineInput + inputStep4 - 4; - for (int kw = 0; kw < kernelWidth; kw++) { - const int w = iw + kw; - const float *cursorInput = nullptr; - if (w < 0) { // left replicate - cursorInput = paddedLineInput; - } else if (w >= inputWidth) { // right replicate - cursorInput = rightEdge; - } else { - cursorInput = paddedLineInput + 4 * w; - } - max = Vec4::max(max, Vec4::load(cursorInput)); - } - } - Vec4::save(offsetOutput, max); -} - -static void poolingMax(const float *channelInput, int inputWidth, int inputHeight, float *channelOutput, - int outputWidth, int outputHeight, int kernelWidth, int kernelHeight, int strideWidth, - int strideHeight, int padWidth, int padHeight, MNN::PoolPadType padType, MNN::AvgPoolCountType countType) { - // Compute Mid Rect - int l = 0, t = 0, r = outputWidth, b = outputHeight; - for (; l * strideWidth - padWidth < 0 && l < outputWidth; l++) { - // do nothing - } - for (; t * strideHeight - padHeight < 0 && t < outputHeight; t++) { - // do nothing - } - for (; (r - 1) * strideWidth - padWidth + (kernelWidth - 1) >= inputWidth && r > l; r--) { - // do nothing - } - for (; (b - 1) * strideHeight - padHeight + (kernelHeight - 1) >= inputHeight && b > t; b--) { - // do nothing - } - int padTop = t, padBottom = b, padLeft = l, padRight = r; - - const int inputStep4 = 4 * inputWidth; - const int inputSize4 = inputStep4 * inputHeight; - const int strideInputStep4 = strideHeight * inputStep4; - const int outputStep4 = 4 * outputWidth; - const int strideWidth4 = 4 * strideWidth; - - { // handle paddings top - float *lineOutput = channelOutput; - for (int oh = 0, ih = -padHeight; oh < padTop; oh++, ih += strideHeight, lineOutput += outputStep4) { - float *offsetOutput = lineOutput; - for (int ow = 0, iw = -padWidth; ow < outputWidth; ow++, iw += strideWidth, offsetOutput += 4) { - pooling_max_pad(channelInput, offsetOutput, inputWidth, inputHeight, inputStep4, inputSize4, - kernelWidth, kernelHeight, iw, ih); - } - } - for (int oh = padTop, ih = -padHeight + oh * strideHeight; oh < padBottom; - oh++, ih += strideHeight, lineOutput += outputStep4) { - float *offsetOutput = lineOutput; - for (int ow = 0, iw = -padWidth; ow < padLeft; ow++, iw += strideWidth, offsetOutput += 4) { - pooling_max_pad(channelInput, offsetOutput, inputWidth, inputHeight, inputStep4, inputSize4, - kernelWidth, kernelHeight, iw, ih); - } - offsetOutput = lineOutput + padRight * 4; - for (int ow = padRight, iw = -padWidth + ow * strideWidth; ow < outputWidth; - ow++, iw += strideWidth, offsetOutput 
+= 4) { - pooling_max_pad(channelInput, offsetOutput, inputWidth, inputHeight, inputStep4, inputSize4, - kernelWidth, kernelHeight, iw, ih); - } - } - for (int oh = padBottom, ih = -padHeight + oh * strideHeight; oh < outputHeight; - oh++, ih += strideHeight, lineOutput += outputStep4) { - float *offsetOutput = lineOutput; - for (int ow = 0, iw = -padWidth; ow < outputWidth; ow++, iw += strideWidth, offsetOutput += 4) { - pooling_max_pad(channelInput, offsetOutput, inputWidth, inputHeight, inputStep4, inputSize4, - kernelWidth, kernelHeight, iw, ih); - } - } - } - - { // handle no paddings - const float *lineInput = - channelInput + (padTop * strideHeight - padHeight) * inputStep4 + (padLeft * strideWidth - padWidth) * 4; - float *lineOutput = channelOutput + padTop * outputStep4 + padLeft * 4; - int wCount = padRight - padLeft; - int wCountC4 = wCount / 4; - int wCountRemain = wCount - wCountC4 * 4; - int strideWidthFuse = strideWidth4 * 4; - - for (int oh = padTop, ih = -padHeight + oh * strideHeight; oh < padBottom; - oh++, ih += strideHeight, lineOutput += outputStep4, lineInput += strideInputStep4) { - const float *offsetInput = lineInput; - float *offsetOutput = lineOutput; - for (int owf = 0; owf < wCountC4; ++owf, offsetOutput += 16, offsetInput += strideWidthFuse) { - Vec4 max0 = Vec4(-FLT_MAX); - Vec4 max1 = Vec4(-FLT_MAX); - Vec4 max2 = Vec4(-FLT_MAX); - Vec4 max3 = Vec4(-FLT_MAX); - const float *kernelInput = offsetInput; - for (int kh = 0; kh < kernelHeight; kh++, kernelInput += inputStep4) { - const float *cursorInput = kernelInput; - for (int kw = 0; kw < kernelWidth; kw++, cursorInput += 4) { - max0 = Vec4::max(max0, Vec4::load(cursorInput + 0 * strideWidth4)); - max1 = Vec4::max(max1, Vec4::load(cursorInput + 1 * strideWidth4)); - max2 = Vec4::max(max2, Vec4::load(cursorInput + 2 * strideWidth4)); - max3 = Vec4::max(max3, Vec4::load(cursorInput + 3 * strideWidth4)); - } - } - Vec4::save(offsetOutput + 4 * 0, max0); - Vec4::save(offsetOutput + 4 * 1, max1); - Vec4::save(offsetOutput + 4 * 2, max2); - Vec4::save(offsetOutput + 4 * 3, max3); - } - for (int ow = 0; ow < wCountRemain; - ow++, offsetOutput += 4, offsetInput += strideWidth4) { - const float *kernelInput = offsetInput; - Vec4 max = Vec4(-FLT_MAX); - for (int kh = 0; kh < kernelHeight; kh++, kernelInput += inputStep4) { - const float *cursorInput = kernelInput; - for (int kw = 0; kw < kernelWidth; kw++, cursorInput += 4) { - max = Vec4::max(max, Vec4::load(cursorInput)); - } - } - - Vec4::save(offsetOutput, max); - } - } - } -} - -static void poolingAvgPad(const float *offsetInput, float *offsetOutput, int inputWidth, int inputHeight, - int kernelWidth, int kernelHeight, int inputStep4, int iw, int ih, int padWidth, - int padHeight, MNN::PoolPadType padType, MNN::AvgPoolCountType countType) { - Vec4 sum = Vec4(0.0f); - - const int khs = 0 < -ih ? -ih : 0; // max - const int khe = kernelHeight < inputHeight - ih ? kernelHeight : inputHeight - ih; // min - const int kws = 0 < -iw ? -iw : 0; // max - const int kwe = kernelWidth < inputWidth - iw ? 
kernelWidth : inputWidth - iw; // min - - // sum - int count = 0; - if (countType == MNN::AvgPoolCountType_DEFAULT) { - if (padType == MNN::PoolPadType_CAFFE) { - countType = MNN::AvgPoolCountType_INCLUDE_PADDING; - } else { - countType = MNN::AvgPoolCountType_EXCLUDE_PADDING; - } - } - if (countType == MNN::AvgPoolCountType_INCLUDE_PADDING) { - count = (ALIMIN(ih + kernelHeight, inputHeight + padHeight) - ih) * - (ALIMIN(iw + kernelWidth, inputWidth + padWidth) - iw); - } else { - count = (khe - khs) * (kwe - kws); - } - - const float *kernelInput = offsetInput + khs * inputStep4; - for (int kh = khs; kh < khe; kh++, kernelInput += inputStep4) { - const float *cursorInput = kernelInput + kws * 4; - for (int kw = kws; kw < kwe; kw++, cursorInput += 4) { - sum = sum + Vec4::load(cursorInput); - } - } - - // avg - if (count > 0) { - Vec4 divs = Vec4(1.0f / count); - Vec4::save(offsetOutput, sum * divs); - } else { - Vec4::save(offsetOutput, Vec4(0.0f)); - } -} - -static void poolingAvg(const float *channelInput, int inputWidth, int inputHeight, float *channelOutput, - int outputWidth, int outputHeight, int kernelWidth, int kernelHeight, int strideWidth, - int strideHeight, int padWidth, int padHeight, MNN::PoolPadType padType, MNN::AvgPoolCountType countType) { - // Compute Mid Rect - int l = 0, t = 0, r = outputWidth, b = outputHeight; - for (; l * strideWidth - padWidth < 0 && l < outputWidth; l++) { - // do nothing - } - for (; t * strideHeight - padHeight < 0 && t < outputHeight; t++) { - // do nothing - } - for (; (r - 1) * strideWidth - padWidth + (kernelWidth - 1) >= inputWidth && r > l; r--) { - // do nothing - } - for (; (b - 1) * strideHeight - padHeight + (kernelHeight - 1) >= inputHeight && b > t; b--) { - // do nothing - } - int padTop = t, padBottom = b, padLeft = l, padRight = r; - - - const int inputStep4 = 4 * inputWidth; - const int strideInputStep4 = strideHeight * inputStep4; - const int outputStep4 = 4 * outputWidth; - const int strideWidth4 = 4 * strideWidth; - - { // handle paddings - const float *lineInput = channelInput - padHeight * inputStep4 - padWidth * 4; - float *lineOutput = channelOutput; - for (int oh = 0, ih = -padHeight; oh < padTop; - oh++, ih += strideHeight, lineOutput += outputStep4, lineInput += strideInputStep4) { - const float *offsetInput = lineInput; - float *offsetOutput = lineOutput; - for (int ow = 0, iw = -padWidth; ow < outputWidth; - ow++, iw += strideWidth, offsetOutput += 4, offsetInput += strideWidth4) { - poolingAvgPad(offsetInput, offsetOutput, inputWidth, inputHeight, kernelWidth, kernelHeight, inputStep4, - iw, ih, padWidth, padHeight, padType, countType); - } - } - for (int oh = padTop, ih = -padHeight + oh * strideHeight; oh < padBottom; - oh++, ih += strideHeight, lineOutput += outputStep4, lineInput += strideInputStep4) { - const float *offsetInput = lineInput; - float *offsetOutput = lineOutput; - for (int ow = 0, iw = -padWidth; ow < padLeft; - ow++, iw += strideWidth, offsetOutput += 4, offsetInput += strideWidth4) { - poolingAvgPad(offsetInput, offsetOutput, inputWidth, inputHeight, kernelWidth, kernelHeight, inputStep4, - iw, ih, padWidth, padHeight, padType, countType); - } - offsetInput = lineInput + padRight * strideWidth * 4; - offsetOutput = lineOutput + padRight * 4; - for (int ow = padRight, iw = -padWidth + ow * strideWidth; ow < outputWidth; - ow++, iw += strideWidth, offsetOutput += 4, offsetInput += strideWidth4) { - poolingAvgPad(offsetInput, offsetOutput, inputWidth, inputHeight, kernelWidth, kernelHeight, 
inputStep4, - iw, ih, padWidth, padHeight, padType, countType); - } - } - for (int oh = padBottom, ih = -padHeight + oh * strideHeight; oh < outputHeight; - oh++, ih += strideHeight, lineOutput += outputStep4, lineInput += strideInputStep4) { - const float *offsetInput = lineInput; - float *offsetOutput = lineOutput; - for (int ow = 0, iw = -padWidth; ow < outputWidth; - ow++, iw += strideWidth, offsetOutput += 4, offsetInput += strideWidth4) { - poolingAvgPad(offsetInput, offsetOutput, inputWidth, inputHeight, kernelWidth, kernelHeight, inputStep4, - iw, ih, padWidth, padHeight, padType, countType); - } - } - } - - { // handle no paddings - const float *lineInput = - channelInput + (padTop * strideHeight - padHeight) * inputStep4 + (padLeft * strideWidth - padWidth) * 4; - float *lineOutput = channelOutput + padTop * outputStep4 + padLeft * 4; - - int count = kernelHeight * kernelWidth; - Vec4 divs = Vec4(1.0f / count); - for (int oh = padTop, ih = -padHeight + oh * strideHeight; oh < padBottom; - oh++, ih += strideHeight, lineOutput += outputStep4, lineInput += strideInputStep4) { - const float *offsetInput = lineInput; - float *offsetOutput = lineOutput; - for (int ow = padLeft, iw = -padWidth + ow * strideWidth; ow < padRight; - ow++, iw += strideWidth, offsetOutput += 4, offsetInput += strideWidth4) { - Vec4 sum = Vec4(0.0f); - // sum - const float *kernelInput = offsetInput; - for (int kh = 0; kh < kernelHeight; kh++, kernelInput += inputStep4) { - const float *cursorInput = kernelInput; - for (int kw = 0; kw < kernelWidth; kw++, cursorInput += 4) { - sum = sum + Vec4::load(cursorInput); - } - } - Vec4::save(offsetOutput, sum * divs); - } - } - } -} +using Vec16 = MNN::Math::Vec; namespace MNN { -CPUPool::CPUPool(Backend *b, const Pool *parameter) : MNN::Execution(b), mParameter(parameter) { - // nothing to do -} - -ErrorCode CPUPool::onResize(const std::vector &inputs, const std::vector &outputs) { - auto layer = mParameter; - int strideWidth = layer->strideX(); - int strideHeight = layer->strideY(); - int padWidth = layer->padX(); - int padHeight = layer->padY(); - - // edit const if global - auto input = inputs[0]; - auto output = outputs[0]; - int kernelWidth = layer->kernelX(); - int kernelHeight = layer->kernelY(); - if (layer->isGlobal()) { - kernelWidth = input->width(); - kernelHeight = input->height(); - strideWidth = input->width(); - strideHeight = input->height(); - padWidth = 0; - padHeight = 0; - } - if (layer->padType() == PoolPadType_SAME) { - int padNeededWidth = (output->width() - 1) * strideWidth + kernelWidth - input->width(); - int padNeededHeight = (output->height() - 1) * strideHeight + kernelHeight - input->height(); - padWidth = padNeededWidth > 0 ? padNeededWidth / 2 : 0; - padHeight = padNeededHeight > 0 ? 
padNeededHeight / 2 : 0; - } else if (layer->padType() == PoolPadType_VALID) { - padWidth = padHeight = 0; - } - auto poolType = layer->type(); - auto planeFunction = poolingMax; - if (poolType == PoolType_AVEPOOL) { - planeFunction = poolingAvg; - } - auto totalDepth = input->batch() * UP_DIV(input->channel(), 4); - auto inputData = input->host(); - auto outputData = output->host(); - auto inputPlaneStride = 4 * input->width() * input->height(); - auto outputPlaneStride = 4 * output->width() * output->height(); - int threadNumber = ((CPUBackend *)backend())->threadNumber(); - auto padType = layer->padType(); - auto countType = layer->countType(); - if (layer->pads() != nullptr && padType == PoolPadType_CAFFE) { - padType = PoolPadType_VALID; - } - mFunction = std::make_pair(threadNumber, [=](int tId) { - for (int channel = (int)tId; channel < totalDepth; channel += threadNumber) { - // run - planeFunction(inputData + channel * inputPlaneStride, input->width(), input->height(), - outputData + outputPlaneStride * channel, output->width(), output->height(), kernelWidth, - kernelHeight, strideWidth, strideHeight, padWidth, padHeight, padType, countType); - } - }); - return NO_ERROR; -} - -ErrorCode CPUPool::onExecute(const std::vector &inputs, const std::vector &outputs) { - MNN_CONCURRENCY_BEGIN(tId, mFunction.first) { - mFunction.second((int)tId); - } - MNN_CONCURRENCY_END(); - return NO_ERROR; -} class CPUPoolCreator : public CPUBackend::Creator { public: virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, const MNN::Op *op, Backend *backend) const override { - return new CPUPool(backend, op->main_as_Pool()); + if (inputs[0]->getType() == halide_type_of()) { + return new CPUPool(backend, op->main_as_Pool()); + } + return new CPUPool(backend, op->main_as_Pool()); } }; diff --git a/source/backend/cpu/CPUPool.hpp b/source/backend/cpu/CPUPool.hpp index b7f524cb..5d102044 100644 --- a/source/backend/cpu/CPUPool.hpp +++ b/source/backend/cpu/CPUPool.hpp @@ -10,14 +10,373 @@ #define CPUPool_hpp #include "backend/cpu/CPUBackend.hpp" +#include +#include +#include "core/Macro.h" + +#include "core/Concurrency.h" namespace MNN { + +template +static void pooling_max_pad(const T* channelInput, T* offsetOutput, int inputWidth, int inputHeight, + int inputStep4, int inputSize4, int kernelWidth, int kernelHeight, int iw, int ih) { + VEC max = VEC(-std::numeric_limits::max()); + + const T *bottomLine = channelInput + inputSize4 - inputStep4; + for (int kh = 0; kh < kernelHeight; kh++) { + const int h = ih + kh; + const T *paddedLineInput = nullptr; + if (h < 0) { // top replicate + paddedLineInput = channelInput; + } else if (h >= inputHeight) { // bottom replicate + paddedLineInput = bottomLine; + } else { + paddedLineInput = channelInput + h * inputStep4; + } + + const T *rightEdge = paddedLineInput + inputStep4 - 4; + for (int kw = 0; kw < kernelWidth; kw++) { + const int w = iw + kw; + const T *cursorInput = nullptr; + if (w < 0) { // left replicate + cursorInput = paddedLineInput; + } else if (w >= inputWidth) { // right replicate + cursorInput = rightEdge; + } else { + cursorInput = paddedLineInput + 4 * w; + } + max = VEC::max(max, VEC::load(cursorInput)); + } + } + VEC::save(offsetOutput, max); +} + +template +static void poolingMax(const T *channelInput, int inputWidth, int inputHeight, T *channelOutput, + int outputWidth, int outputHeight, int kernelWidth, int kernelHeight, int strideWidth, + int strideHeight, int padWidth, int padHeight, MNN::PoolPadType padType, 
MNN::AvgPoolCountType countType) { + // Compute Mid Rect + int l = 0, t = 0, r = outputWidth, b = outputHeight; + for (; l * strideWidth - padWidth < 0 && l < outputWidth; l++) { + // do nothing + } + for (; t * strideHeight - padHeight < 0 && t < outputHeight; t++) { + // do nothing + } + for (; (r - 1) * strideWidth - padWidth + (kernelWidth - 1) >= inputWidth && r > l; r--) { + // do nothing + } + for (; (b - 1) * strideHeight - padHeight + (kernelHeight - 1) >= inputHeight && b > t; b--) { + // do nothing + } + int padTop = t, padBottom = b, padLeft = l, padRight = r; + + const int inputStep4 = 4 * inputWidth; + const int inputSize4 = inputStep4 * inputHeight; + const int strideInputStep4 = strideHeight * inputStep4; + const int outputStep4 = 4 * outputWidth; + const int strideWidth4 = 4 * strideWidth; + + { // handle paddings top + T *lineOutput = channelOutput; + for (int oh = 0, ih = -padHeight; oh < padTop; oh++, ih += strideHeight, lineOutput += outputStep4) { + T *offsetOutput = lineOutput; + for (int ow = 0, iw = -padWidth; ow < outputWidth; ow++, iw += strideWidth, offsetOutput += 4) { + pooling_max_pad(channelInput, offsetOutput, inputWidth, inputHeight, inputStep4, inputSize4, + kernelWidth, kernelHeight, iw, ih); + } + } + for (int oh = padTop, ih = -padHeight + oh * strideHeight; oh < padBottom; + oh++, ih += strideHeight, lineOutput += outputStep4) { + T *offsetOutput = lineOutput; + for (int ow = 0, iw = -padWidth; ow < padLeft; ow++, iw += strideWidth, offsetOutput += 4) { + pooling_max_pad(channelInput, offsetOutput, inputWidth, inputHeight, inputStep4, inputSize4, + kernelWidth, kernelHeight, iw, ih); + } + offsetOutput = lineOutput + padRight * 4; + for (int ow = padRight, iw = -padWidth + ow * strideWidth; ow < outputWidth; + ow++, iw += strideWidth, offsetOutput += 4) { + pooling_max_pad(channelInput, offsetOutput, inputWidth, inputHeight, inputStep4, inputSize4, + kernelWidth, kernelHeight, iw, ih); + } + } + for (int oh = padBottom, ih = -padHeight + oh * strideHeight; oh < outputHeight; + oh++, ih += strideHeight, lineOutput += outputStep4) { + T *offsetOutput = lineOutput; + for (int ow = 0, iw = -padWidth; ow < outputWidth; ow++, iw += strideWidth, offsetOutput += 4) { + pooling_max_pad(channelInput, offsetOutput, inputWidth, inputHeight, inputStep4, inputSize4, + kernelWidth, kernelHeight, iw, ih); + } + } + } + + { // handle no paddings + const T *lineInput = + channelInput + (padTop * strideHeight - padHeight) * inputStep4 + (padLeft * strideWidth - padWidth) * 4; + T *lineOutput = channelOutput + padTop * outputStep4 + padLeft * 4; + int wCount = padRight - padLeft; + int wCountC4 = wCount / 4; + int wCountRemain = wCount - wCountC4 * 4; + int strideWidthFuse = strideWidth4 * 4; + + for (int oh = padTop, ih = -padHeight + oh * strideHeight; oh < padBottom; + oh++, ih += strideHeight, lineOutput += outputStep4, lineInput += strideInputStep4) { + const T *offsetInput = lineInput; + T *offsetOutput = lineOutput; + for (int owf = 0; owf < wCountC4; ++owf, offsetOutput += 16, offsetInput += strideWidthFuse) { + VEC max0 = VEC(-std::numeric_limits::max()); + VEC max1 = VEC(-std::numeric_limits::max()); + VEC max2 = VEC(-std::numeric_limits::max()); + VEC max3 = VEC(-std::numeric_limits::max()); + const T *kernelInput = offsetInput; + for (int kh = 0; kh < kernelHeight; kh++, kernelInput += inputStep4) { + const T *cursorInput = kernelInput; + for (int kw = 0; kw < kernelWidth; kw++, cursorInput += 4) { + max0 = VEC::max(max0, VEC::load(cursorInput + 0 * 
strideWidth4)); + max1 = VEC::max(max1, VEC::load(cursorInput + 1 * strideWidth4)); + max2 = VEC::max(max2, VEC::load(cursorInput + 2 * strideWidth4)); + max3 = VEC::max(max3, VEC::load(cursorInput + 3 * strideWidth4)); + } + } + VEC::save(offsetOutput + 4 * 0, max0); + VEC::save(offsetOutput + 4 * 1, max1); + VEC::save(offsetOutput + 4 * 2, max2); + VEC::save(offsetOutput + 4 * 3, max3); + } + for (int ow = 0; ow < wCountRemain; + ow++, offsetOutput += 4, offsetInput += strideWidth4) { + const T *kernelInput = offsetInput; + VEC max = VEC(-std::numeric_limits::max()); + for (int kh = 0; kh < kernelHeight; kh++, kernelInput += inputStep4) { + const T *cursorInput = kernelInput; + for (int kw = 0; kw < kernelWidth; kw++, cursorInput += 4) { + max = VEC::max(max, VEC::load(cursorInput)); + } + } + + VEC::save(offsetOutput, max); + } + } + } +} + +template +static void poolingAvgPad(const T *offsetInput, T *offsetOutput, int inputWidth, int inputHeight, + int kernelWidth, int kernelHeight, int inputStep4, int iw, int ih, int padWidth, + int padHeight, MNN::PoolPadType padType, MNN::AvgPoolCountType countType) { + VEC sum = VEC(0.0f); + + const int khs = 0 < -ih ? -ih : 0; // max + const int khe = kernelHeight < inputHeight - ih ? kernelHeight : inputHeight - ih; // min + const int kws = 0 < -iw ? -iw : 0; // max + const int kwe = kernelWidth < inputWidth - iw ? kernelWidth : inputWidth - iw; // min + + // sum + int count = 0; + if (countType == MNN::AvgPoolCountType_DEFAULT) { + if (padType == MNN::PoolPadType_CAFFE) { + countType = MNN::AvgPoolCountType_INCLUDE_PADDING; + } else { + countType = MNN::AvgPoolCountType_EXCLUDE_PADDING; + } + } + if (countType == MNN::AvgPoolCountType_INCLUDE_PADDING) { + count = (ALIMIN(ih + kernelHeight, inputHeight + padHeight) - ih) * + (ALIMIN(iw + kernelWidth, inputWidth + padWidth) - iw); + } else { + count = (khe - khs) * (kwe - kws); + } + + const T *kernelInput = offsetInput + khs * inputStep4; + for (int kh = khs; kh < khe; kh++, kernelInput += inputStep4) { + const T *cursorInput = kernelInput + kws * 4; + for (int kw = kws; kw < kwe; kw++, cursorInput += 4) { + sum = sum + VEC::load(cursorInput); + } + } + + // avg + if (count > 0) { + VEC divs = VEC(1.0f / count); + VEC::save(offsetOutput, sum * divs); + } else { + VEC::save(offsetOutput, VEC(0.0f)); + } +} + +template +static void poolingAvg(const T* channelInput, int inputWidth, int inputHeight, T *channelOutput, + int outputWidth, int outputHeight, int kernelWidth, int kernelHeight, int strideWidth, + int strideHeight, int padWidth, int padHeight, MNN::PoolPadType padType, MNN::AvgPoolCountType countType) { + // Compute Mid Rect + int l = 0, t = 0, r = outputWidth, b = outputHeight; + for (; l * strideWidth - padWidth < 0 && l < outputWidth; l++) { + // do nothing + } + for (; t * strideHeight - padHeight < 0 && t < outputHeight; t++) { + // do nothing + } + for (; (r - 1) * strideWidth - padWidth + (kernelWidth - 1) >= inputWidth && r > l; r--) { + // do nothing + } + for (; (b - 1) * strideHeight - padHeight + (kernelHeight - 1) >= inputHeight && b > t; b--) { + // do nothing + } + int padTop = t, padBottom = b, padLeft = l, padRight = r; + + const int inputStep4 = 4 * inputWidth; + const int strideInputStep4 = strideHeight * inputStep4; + const int outputStep4 = 4 * outputWidth; + const int strideWidth4 = 4 * strideWidth; + + { // handle paddings + const T *lineInput = channelInput - padHeight * inputStep4 - padWidth * 4; + T *lineOutput = channelOutput; + for (int oh = 0, ih = -padHeight; oh 
< padTop; + oh++, ih += strideHeight, lineOutput += outputStep4, lineInput += strideInputStep4) { + const T *offsetInput = lineInput; + T *offsetOutput = lineOutput; + for (int ow = 0, iw = -padWidth; ow < outputWidth; + ow++, iw += strideWidth, offsetOutput += 4, offsetInput += strideWidth4) { + poolingAvgPad(offsetInput, offsetOutput, inputWidth, inputHeight, kernelWidth, kernelHeight, inputStep4, + iw, ih, padWidth, padHeight, padType, countType); + } + } + for (int oh = padTop, ih = -padHeight + oh * strideHeight; oh < padBottom; + oh++, ih += strideHeight, lineOutput += outputStep4, lineInput += strideInputStep4) { + const T *offsetInput = lineInput; + T *offsetOutput = lineOutput; + for (int ow = 0, iw = -padWidth; ow < padLeft; + ow++, iw += strideWidth, offsetOutput += 4, offsetInput += strideWidth4) { + poolingAvgPad(offsetInput, offsetOutput, inputWidth, inputHeight, kernelWidth, kernelHeight, inputStep4, + iw, ih, padWidth, padHeight, padType, countType); + } + offsetInput = lineInput + padRight * strideWidth * 4; + offsetOutput = lineOutput + padRight * 4; + for (int ow = padRight, iw = -padWidth + ow * strideWidth; ow < outputWidth; + ow++, iw += strideWidth, offsetOutput += 4, offsetInput += strideWidth4) { + poolingAvgPad(offsetInput, offsetOutput, inputWidth, inputHeight, kernelWidth, kernelHeight, inputStep4, + iw, ih, padWidth, padHeight, padType, countType); + } + } + for (int oh = padBottom, ih = -padHeight + oh * strideHeight; oh < outputHeight; + oh++, ih += strideHeight, lineOutput += outputStep4, lineInput += strideInputStep4) { + const T *offsetInput = lineInput; + T *offsetOutput = lineOutput; + for (int ow = 0, iw = -padWidth; ow < outputWidth; + ow++, iw += strideWidth, offsetOutput += 4, offsetInput += strideWidth4) { + poolingAvgPad(offsetInput, offsetOutput, inputWidth, inputHeight, kernelWidth, kernelHeight, inputStep4, + iw, ih, padWidth, padHeight, padType, countType); + } + } + } + + { // handle no paddings + const T *lineInput = + channelInput + (padTop * strideHeight - padHeight) * inputStep4 + (padLeft * strideWidth - padWidth) * 4; + T *lineOutput = channelOutput + padTop * outputStep4 + padLeft * 4; + + int count = kernelHeight * kernelWidth; + VEC divs = VEC(1.0f / count); + for (int oh = padTop, ih = -padHeight + oh * strideHeight; oh < padBottom; + oh++, ih += strideHeight, lineOutput += outputStep4, lineInput += strideInputStep4) { + const T *offsetInput = lineInput; + T *offsetOutput = lineOutput; + for (int ow = padLeft, iw = -padWidth + ow * strideWidth; ow < padRight; + ow++, iw += strideWidth, offsetOutput += 4, offsetInput += strideWidth4) { + VEC sum = VEC(0); + // sum + const T *kernelInput = offsetInput; + for (int kh = 0; kh < kernelHeight; kh++, kernelInput += inputStep4) { + const T *cursorInput = kernelInput; + for (int kw = 0; kw < kernelWidth; kw++, cursorInput += 4) { + sum = sum + VEC::load(cursorInput); + } + } + VEC::save(offsetOutput, sum * divs); + } + } + } +} + + +template class CPUPool : public Execution { public: - CPUPool(Backend *b, const Pool *parameter); + CPUPool(Backend *b, const Pool *parameter) : MNN::Execution(b), mParameter(parameter) { + // Do nothing + } virtual ~CPUPool() = default; - virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; + virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override { + auto layer = mParameter; + int strideWidth = 
layer->strideX(); + int strideHeight = layer->strideY(); + int padWidth = layer->padX(); + int padHeight = layer->padY(); + + // edit const if global + auto input = inputs[0]; + auto output = outputs[0]; + int kernelWidth = layer->kernelX(); + int kernelHeight = layer->kernelY(); + if (layer->isGlobal()) { + kernelWidth = input->width(); + kernelHeight = input->height(); + strideWidth = input->width(); + strideHeight = input->height(); + padWidth = 0; + padHeight = 0; + } + if (layer->padType() == PoolPadType_SAME) { + int padNeededWidth = (output->width() - 1) * strideWidth + kernelWidth - input->width(); + int padNeededHeight = (output->height() - 1) * strideHeight + kernelHeight - input->height(); + padWidth = padNeededWidth > 0 ? padNeededWidth / 2 : 0; + padHeight = padNeededHeight > 0 ? padNeededHeight / 2 : 0; + } else if (layer->padType() == PoolPadType_VALID) { + padWidth = padHeight = 0; + } + auto poolType = layer->type(); + auto totalDepth = input->batch() * UP_DIV(input->channel(), 4); + auto inputData = input->host(); + auto outputData = output->host(); + auto inputPlaneStride = 4 * input->width() * input->height(); + auto outputPlaneStride = 4 * output->width() * output->height(); + int threadNumber = ((CPUBackend *)backend())->threadNumber(); + auto padType = layer->padType(); + auto countType = layer->countType(); + if (layer->pads() != nullptr && padType == PoolPadType_CAFFE) { + padType = PoolPadType_VALID; + } + if (poolType == PoolType_AVEPOOL) { + mFunction = std::make_pair(threadNumber, [=](int tId) { + for (int channel = (int)tId; channel < totalDepth; channel += threadNumber) { + // run + poolingAvg(inputData + channel * inputPlaneStride, input->width(), input->height(), + outputData + outputPlaneStride * channel, output->width(), output->height(), kernelWidth, + kernelHeight, strideWidth, strideHeight, padWidth, padHeight, padType, countType); + } + }); + } else { + mFunction = std::make_pair(threadNumber, [=](int tId) { + for (int channel = (int)tId; channel < totalDepth; channel += threadNumber) { + // run + poolingMax(inputData + channel * inputPlaneStride, input->width(), input->height(), + outputData + outputPlaneStride * channel, output->width(), output->height(), kernelWidth, + kernelHeight, strideWidth, strideHeight, padWidth, padHeight, padType, countType); + } + }); + } + + return NO_ERROR; + } + virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override { + MNN_CONCURRENCY_BEGIN(tId, mFunction.first) { + mFunction.second((int)tId); + } + MNN_CONCURRENCY_END(); + return NO_ERROR; + } private: const Pool *mParameter; diff --git a/source/backend/cpu/CPUPriorbox.cpp b/source/backend/cpu/CPUPriorbox.cpp deleted file mode 100644 index bf2bdddc..00000000 --- a/source/backend/cpu/CPUPriorbox.cpp +++ /dev/null @@ -1,172 +0,0 @@ -// -// CPUPriorbox.cpp -// MNN -// -// Created by MNN on 2018/07/18. 
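The CPUPool hunk above moves the pooling kernels into CPUPool.hpp and templates them on the element type and its packed vector wrapper (the Vec4/Vec16 aliases), so one max/avg implementation serves both float and int8 tensors and the creator picks the instantiation from the input's element type. The real template arguments are elided in the text above, so the following is only a standalone sketch of that pattern with stand-in types, not the MNN code itself:

// Illustrative only: SimpleVec stands in for MNN::Math::Vec, and the pack size
// (4 float lanes vs 16 int8 lanes) is passed explicitly.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

template <typename T, int N>
struct SimpleVec {
    T v[N];
    static SimpleVec load(const T* p) { SimpleVec r; for (int i = 0; i < N; ++i) r.v[i] = p[i]; return r; }
    static void save(T* p, const SimpleVec& a) { for (int i = 0; i < N; ++i) p[i] = a.v[i]; }
    static SimpleVec max(const SimpleVec& a, const SimpleVec& b) {
        SimpleVec r; for (int i = 0; i < N; ++i) r.v[i] = std::max(a.v[i], b.v[i]); return r;
    }
};

// One max-pool line kernel shared by every element type: T is the scalar type,
// VEC its packed wrapper, PACK the number of channels packed per pixel.
template <typename T, typename VEC, int PACK>
void maxPoolLine(const T* in, T* out, int outW, int kernelW, int strideW) {
    for (int ow = 0; ow < outW; ++ow) {
        VEC m = VEC::load(in + ow * strideW * PACK);
        for (int kw = 1; kw < kernelW; ++kw) {
            m = VEC::max(m, VEC::load(in + (ow * strideW + kw) * PACK));
        }
        VEC::save(out + ow * PACK, m);
    }
}

int main() {
    // Float instantiation; an int8 run would use
    // maxPoolLine<int8_t, SimpleVec<int8_t, 16>, 16> with the same body.
    std::vector<float> in(8 * 4), out(4 * 4);
    for (size_t i = 0; i < in.size(); ++i) in[i] = float(i % 7);
    maxPoolLine<float, SimpleVec<float, 4>, 4>(in.data(), out.data(), 4, 2, 2);
    printf("out[0..3] = %.0f %.0f %.0f %.0f\n", out[0], out[1], out[2], out[3]);
    return 0;
}

The creator in the hunk makes the equivalent choice at runtime by comparing inputs[0]->getType() against halide_type_of for the int8 case (the type argument is elided above, but the Vec16 alias suggests int8) and falling back to the float instantiation otherwise.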
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "backend/cpu/CPUPriorbox.hpp" -#include -#include "core/AutoStorage.h" -#include "backend/cpu/CPUBackend.hpp" -#include "backend/cpu/compute/CommonOptFunction.h" -#include "core/TensorUtils.hpp" - -namespace MNN { - -CPUPriorBox::CPUPriorBox(Backend *b, const MNN::Op *op) : MNN::Execution(b) { - mParameter = op->main_as_PriorBox(); -} - -ErrorCode CPUPriorBox::onExecute(const std::vector &inputs, const std::vector &outputs) { - return NO_ERROR; -} -ErrorCode CPUPriorBox::onResize(const std::vector &inputs, const std::vector &outputs) { - AutoStorage mOutputData; - mOutputData.reset(outputs[0]->height() * outputs[0]->channel()); - - auto layer = mParameter; - auto input0 = inputs[0]; - const int w = input0->width(); - const int h = input0->height(); - - // image width, height - int imageW = layer->imageWidth(); - if (imageW <= 0) { - imageW = inputs[1]->width(); - } - int imageH = layer->imageHeight(); - if (imageH <= 0) { - imageH = inputs[1]->height(); - } - - // step width, height - float stepW = layer->stepWidth(); - if (stepW <= 0) { - stepW = (float)imageW / w; - } - float stepH = layer->stepHeight(); - if (stepH <= 0) { - stepH = (float)imageH / h; - } - - // sizes - auto minSizes = layer->minSizes(); - auto minSizeCount = minSizes ? minSizes->size() : 0; - auto maxSizes = layer->maxSizes(); - auto maxSizeCount = maxSizes ? maxSizes->size() : 0; - auto aspectRatios = layer->aspectRatios(); - bool flip = layer->flip(); - - std::vector aspectRatiosValue{1.0f}; - if (aspectRatios != nullptr) { - for (int i = 0; i < aspectRatios->size(); ++i) { - auto ratio = aspectRatios->data()[i]; - bool exist = false; - for (auto v : aspectRatiosValue) { - auto diff = v - ratio; - if (diff < 0) { - diff = -diff; - } - if (diff < 1e-6) { - exist = true; - break; - } - } - if (!exist) { - aspectRatiosValue.emplace_back(ratio); - if (flip) { - aspectRatiosValue.emplace_back(1.0f / ratio); - } - } - } - } - int priorCount = minSizeCount * aspectRatiosValue.size() + maxSizeCount; - - // boxes - float offset = layer->offset(); - auto boxesPtr = mOutputData.get(); - for (int i = 0; i < h; i++) { - float *box = boxesPtr + i * w * priorCount * 4; - float centerX = offset * stepW; - float centerY = offset * stepH + i * stepH; - for (int j = 0; j < w; j++, centerX += stepW) { - for (int k = 0; k < minSizeCount; k++) { - // min size box - float minSize = minSizes->data()[k]; - { - box[0] = (centerX - minSize * 0.5f) / imageW; - box[1] = (centerY - minSize * 0.5f) / imageH; - box[2] = (centerX + minSize * 0.5f) / imageW; - box[3] = (centerY + minSize * 0.5f) / imageH; - box += 4; - } - - // max size box - if (maxSizeCount > 0) { - float maxSize = maxSizes->data()[k]; - float ssqrt = sqrt(minSize * maxSize); - - box[0] = (centerX - ssqrt * 0.5f) / imageW; - box[1] = (centerY - ssqrt * 0.5f) / imageH; - box[2] = (centerX + ssqrt * 0.5f) / imageW; - box[3] = (centerY + ssqrt * 0.5f) / imageH; - box += 4; - } - - // aspect ratios - for (int p = 0; p < aspectRatiosValue.size(); p++) { - float arsqrt = sqrt(aspectRatiosValue[p]); - if (fabsf(arsqrt - 1.0f) < 1e-6) { - continue; - } - float boxW = minSize * arsqrt; - float boxH = minSize / arsqrt; - - box[0] = (centerX - boxW * 0.5f) / imageW; - box[1] = (centerY - boxH * 0.5f) / imageH; - box[2] = (centerX + boxW * 0.5f) / imageW; - box[3] = (centerY + boxH * 0.5f) / imageH; - box += 4; - } - } - } - } - - // clip - int oh = outputs[0]->height(); - if (layer->clip()) { - float *box = boxesPtr; - for 
(int i = 0; i < oh; i++) { - box[i] = std::min(std::max(box[i], 0.f), 1.f); - } - } - - // set variance - auto variances = layer->variances()->data(); - auto var = boxesPtr + oh; - for (int i = 0; i < oh / 4; i++) { - var[0] = variances[0]; - var[1] = variances[1]; - var[2] = variances[2]; - var[3] = variances[3]; - var += 4; - } - - // transform to output - auto output = outputs[0]; - MNNPackC4(output->host(), mOutputData.get(), output->height(), output->channel()); - return NO_ERROR; -} - -class CPUPriorBoxCreator : public CPUBackend::Creator { -public: - virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, - const MNN::Op *op, Backend *backend) const override { - return new CPUPriorBox(backend, op); - } -}; - -REGISTER_CPU_OP_CREATOR(CPUPriorBoxCreator, OpType_PriorBox); -} // namespace MNN diff --git a/source/backend/cpu/CPUPriorbox.hpp b/source/backend/cpu/CPUPriorbox.hpp deleted file mode 100644 index a6af14a2..00000000 --- a/source/backend/cpu/CPUPriorbox.hpp +++ /dev/null @@ -1,28 +0,0 @@ -// -// CPUPriorbox.hpp -// MNN -// -// Created by MNN on 2018/07/18. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifndef CPUPriorbox_hpp -#define CPUPriorbox_hpp - -#include "core/Execution.hpp" -#include "MNN_generated.h" - -namespace MNN { -class CPUPriorBox : public Execution { -public: - CPUPriorBox(Backend *b, const MNN::Op *op); - virtual ~CPUPriorBox() = default; - virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; - -private: - const MNN::PriorBox *mParameter; -}; - -} // namespace MNN -#endif /* CPUPriorbox_hpp */ diff --git a/source/backend/cpu/CPURank.cpp b/source/backend/cpu/CPURank.cpp deleted file mode 100644 index 9f28bcd5..00000000 --- a/source/backend/cpu/CPURank.cpp +++ /dev/null @@ -1,32 +0,0 @@ -// -// CPURank.cpp -// MNN -// -// Created by MNN on 2018/08/22. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "backend/cpu/CPURank.hpp" -#include "backend/cpu/CPUBackend.hpp" - -namespace MNN { - -CPURank::CPURank(Backend *backend) : Execution(backend) { - // nothing to do -} - -ErrorCode CPURank::onExecute(const std::vector &inputs, const std::vector &outputs) { - outputs[0]->host()[0] = inputs[0]->buffer().dimensions; - return NO_ERROR; -} - -class CPURankCreator : public CPUBackend::Creator { -public: - virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, - const MNN::Op *op, Backend *backend) const { - return new CPURank(backend); - } -}; - -REGISTER_CPU_OP_CREATOR(CPURankCreator, OpType_Rank); -} // namespace MNN diff --git a/source/backend/cpu/CPURank.hpp b/source/backend/cpu/CPURank.hpp deleted file mode 100644 index 6289ac74..00000000 --- a/source/backend/cpu/CPURank.hpp +++ /dev/null @@ -1,24 +0,0 @@ -// -// CPURank.hpp -// MNN -// -// Created by MNN on 2018/08/22. 
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifndef CPURank_hpp -#define CPURank_hpp - -#include "core/Execution.hpp" - -namespace MNN { -class CPURank : public Execution { -public: - CPURank(Backend *backend); - virtual ~CPURank() = default; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; -}; - -} // namespace MNN - -#endif /* CPURank_hpp */ diff --git a/source/backend/cpu/CPURaster.cpp b/source/backend/cpu/CPURaster.cpp index 59d13818..ebef1473 100644 --- a/source/backend/cpu/CPURaster.cpp +++ b/source/backend/cpu/CPURaster.cpp @@ -61,43 +61,6 @@ static void getBatchChannelArea(const Tensor* t, int& batch, int& channel, int& } } } -static bool _singleConvert(const Tensor::InsideDescribe::Region& region, const Tensor* dest) { - // TODO, may be wrong - if (region.offset != nullptr) { - return false; - } - auto origin = region.origin; - auto srcFormat = TensorUtils::getDescribe(origin)->dimensionFormat; - auto dstFormat = TensorUtils::getDescribe(dest)->dimensionFormat; - if (srcFormat == dstFormat) { - return false; - } - if (0 != region.src.offset || 0 != region.dst.offset) { - return false; - } - int dstBatch = 1, dstChannel = 1, dstArea = 1, - srcBatch = 1, srcChannel = 1, srcArea = 1; - getBatchChannelArea(origin, srcBatch, srcChannel, srcArea); - getBatchChannelArea(dest, dstBatch, dstChannel, dstArea); - if (dstBatch != srcBatch) { - return false; - } - if (dstChannel != srcChannel) { - return false; - } - if (dstArea != srcArea) { - return false; - } - auto totalSize = dstBatch * dstChannel * dstArea; - int srcSize = 1; - int dstSize = 1; - for (int i=0; i<3; ++i) { - srcSize += (region.size[i] - 1) * region.src.stride[i]; - dstSize += (region.size[i] - 1) * region.dst.stride[i]; - } - return srcSize == totalSize && dstSize == totalSize; -} - // Detect if the region is a transpose static bool _transpose(const Tensor::InsideDescribe::Region& region) { int srcOne = -1, dstOne = -1; @@ -118,6 +81,53 @@ static bool _transpose(const Tensor::InsideDescribe::Region& region) { return srcOne >= 0 && dstOne >= 0 && srcOne != dstOne; } +static int _singleConvert(const Tensor::InsideDescribe::Region& region, const Tensor* dest) { + // TODO, may be wrong + if (region.offset != nullptr) { + return false; + } + auto origin = region.origin; + auto srcFormat = TensorUtils::getDescribe(origin)->dimensionFormat; + auto dstFormat = TensorUtils::getDescribe(dest)->dimensionFormat; + if (srcFormat == dstFormat) { + return 0; + } + if (0 != region.src.offset || 0 != region.dst.offset) { + return 0; + } + int dstBatch = 1, dstChannel = 1, dstArea = 1, + srcBatch = 1, srcChannel = 1, srcArea = 1; + getBatchChannelArea(origin, srcBatch, srcChannel, srcArea); + getBatchChannelArea(dest, dstBatch, dstChannel, dstArea); + if (dstBatch != srcBatch) { + return 0; + } + if (dstChannel != srcChannel) { + return 0; + } + if (dstArea != srcArea) { + return 0; + } + auto totalSize = dstBatch * dstChannel * dstArea; + int srcSize = 1; + int dstSize = 1; + int res = 1; + for (int i=0; i<3; ++i) { + if (region.size[i] == 1) { + continue; + } + if (region.src.stride[i] != region.dst.stride[i]) { + res = 2; + } + srcSize += (region.size[i] - 1) * region.src.stride[i]; + dstSize += (region.size[i] - 1) * region.dst.stride[i]; + } + if (srcSize != totalSize || dstSize != totalSize ) { + return 0; + } + return res; +} + ErrorCode CPURaster::onResize(const std::vector &inputs, const std::vector &outputs) { MNN_ASSERT(inputs.size() == 1); MNN_ASSERT(outputs.size() == 
1); @@ -161,14 +171,13 @@ ErrorCode CPURaster::onResize(const std::vector &inputs, const std::ve return NO_ERROR; } } - if (1 < static_cast(backend())->threadNumber()) { - mConverter.reset(new CPUTensorConverter(backend())); - } - mSingleConvert = false; + mSingleConvert = 0; // srcNum == 1 && srcFormat != dstFormat : Single Convert - if (des->regions.size() == 1 && _singleConvert(des->regions[0], output)) { - mSingleConvert = true; - return NO_ERROR; + if (des->regions.size() == 1) { + mSingleConvert = _singleConvert(des->regions[0], output); + if (mSingleConvert > 0) { + return NO_ERROR; + } } // input is NC4HW4 add Convert for (int i=0; i< des->regions.size(); ++i) { @@ -328,10 +337,13 @@ static void _1BitcopyWithStrideC4(uint8_t* dstO, const uint8_t* srcO, int size, void CPURaster::executeFaster(const std::vector &inputs, const std::vector &outputs) const { auto input = inputs[0]; auto output = outputs[0]; - auto bytes = input->getType().bytes(); + auto bytes = output->getType().bytes(); + if (mFixBytes > 0) { + bytes = mFixBytes; + } auto threadNum = static_cast(backend())->threadNumber(); if (mNeedZero) { - ::memset(output->host(), 0, output->size()); + ::memset(output->host(), 0, output->elementSize() * bytes); } auto C4proc = _1BitcopyWithStrideC4; switch (bytes) { @@ -425,6 +437,28 @@ static void _blit(const Tensor::InsideDescribe::Region& slice, int bytes, const } } } +void CPURaster::tensorConvert(Tensor* input, Tensor* output, int bytes) { + auto& subIb = input->buffer(); + auto& subOb = output->buffer(); + auto source = TensorUtils::getDescribe(input)->dimensionFormat; + auto dest = TensorUtils::getDescribe(output)->dimensionFormat; + if (subIb.dimensions <= 1 || source == dest) { + ::memcpy(subOb.host, subIb.host, input->size()); + return; + } + auto tup = CPUTensorConverter::splitDimensions(subIb, source); + int area = std::get<1>(tup), batch = std::get<0>(tup), channel = std::get<2>(tup); + const int bitLength = bytes; + + auto numberThread = ((CPUBackend*)backend())->threadNumber(); + MNN_CONCURRENCY_BEGIN(tId, numberThread) { + for (int b = tId; b < batch; b+=numberThread) { + CPUTensorConverter::convert(subIb.host + b * bitLength * subIb.dim[0].stride, subOb.host + b * bitLength * subOb.dim[0].stride, source, dest, 1, area, channel, bitLength); + } + } + MNN_CONCURRENCY_END(); +} + ErrorCode CPURaster::onExecute(const std::vector &inputs, const std::vector &outputs) { if (mFast) { @@ -434,8 +468,12 @@ ErrorCode CPURaster::onExecute(const std::vector &inputs, const std::v auto input = inputs[0]; auto output = outputs[0]; auto bytes = input->getType().bytes(); + if (mFixBytes > 0) { + bytes = mFixBytes; + } + auto outputEleSize = output->elementSize(); auto threadNum = static_cast(backend())->threadNumber(); - if (mSingleConvert) { + if (mSingleConvert > 0) { auto realInput = TensorUtils::getDescribe(input)->regions[0].origin; int srcBatch = 1, srcChannel = 1, srcArea = 1; getBatchChannelArea(realInput, srcBatch, srcChannel, srcArea); @@ -448,9 +486,15 @@ ErrorCode CPURaster::onExecute(const std::vector &inputs, const std::v int outputBatchStride = batchStride; if (MNN_DATA_FORMAT_NC4HW4 == sourceFormat) { inputBatchStride = batchStrideC4; + if (2 == mSingleConvert) { + destFormat = MNN_DATA_FORMAT_NHWC; + } } if (MNN_DATA_FORMAT_NC4HW4 == destFormat) { outputBatchStride = batchStrideC4; + if (2 == mSingleConvert) { + sourceFormat = MNN_DATA_FORMAT_NHWC; + } } MNN_CONCURRENCY_BEGIN(tId, threadNum) { for (int b=(int)tId; b &inputs, const std::v } if (mNeedZero) { if 
(mTempOutput == nullptr) { - ::memset(output->host(), 0, output->size()); + ::memset(output->host(), 0, outputEleSize * bytes); } else { - ::memset(mTempOutput->host(), 0, mTempOutput->size()); + ::memset(mTempOutput->host(), 0, mTempOutput->elementSize() * bytes); } } for (auto& iter : mTempInput) { - if (nullptr != mConverter) { - mConverter->onExecute({iter.first}, {iter.second.get()}); - } else { - CPUTensorConverter::convert(iter.first, iter.second.get()); - } + tensorConvert(iter.first, iter.second.get(), bytes); } auto proc = _1BitcopyWithStride; switch (bytes) { @@ -517,11 +557,7 @@ ErrorCode CPURaster::onExecute(const std::vector &inputs, const std::v } MNN_CONCURRENCY_END(); if (nullptr != mTempOutput) { - if (nullptr != mConverter) { - mConverter->onExecute({mTempOutput.get()}, {output}); - } else { - CPUTensorConverter::convert(mTempOutput.get(), output); - } + tensorConvert(mTempOutput.get(), output, bytes); } return NO_ERROR; } diff --git a/source/backend/cpu/CPURaster.hpp b/source/backend/cpu/CPURaster.hpp index e8195d73..c2d9b40a 100644 --- a/source/backend/cpu/CPURaster.hpp +++ b/source/backend/cpu/CPURaster.hpp @@ -13,8 +13,8 @@ namespace MNN { class CPURaster : public Execution { public: - CPURaster(Backend* bn) : Execution(bn) { - // Do nothing + CPURaster(Backend* bn, int fixBytes = 0) : Execution(bn) { + mFixBytes = fixBytes; } virtual ~ CPURaster() { // Do nothing @@ -23,16 +23,17 @@ public: virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; void executeFaster(const std::vector &inputs, const std::vector &outputs) const; + void tensorConvert(Tensor* input, Tensor* output, int bytes); private: std::map> mTempInput; std::vector> mTempInputCopy; std::vector> mFastBlit; std::shared_ptr mTempOutput; - std::shared_ptr mConverter; void* mOutputPtr; bool mNeedZero = false; bool mFast = false; - bool mSingleConvert = false; + int mSingleConvert = 0; + int mFixBytes; }; } #endif diff --git a/source/backend/cpu/CPURelu.cpp b/source/backend/cpu/CPURelu.cpp index 03cbe2e8..ad7c0003 100644 --- a/source/backend/cpu/CPURelu.cpp +++ b/source/backend/cpu/CPURelu.cpp @@ -18,6 +18,30 @@ ErrorCode CPURelu::onExecute(const std::vector& inputs, const std::vect auto& ib = inputs[0]->buffer(); auto& ob = outputs[0]->buffer(); + if (inputs[0]->getType() == halide_type_of()) { + const int8_t* srcO = (const int8_t*)ib.host; + int8_t* dstO = (int8_t*)ob.host; + auto size = inputs[0]->size() / sizeof(int8_t); + auto numberThread = ((CPUBackend*)backend())->threadNumber(); + int sizeQuad = size / 16; + int remain = sizeQuad * 16; + int sizeDivide = sizeQuad / numberThread; + if (sizeQuad > 0) { + MNN_CONCURRENCY_BEGIN(tId, numberThread) { + int number = sizeDivide; + if (tId == numberThread - 1) { + number = sizeQuad - tId * sizeDivide; + } + MNNReluInt8(dstO + 16 * tId * sizeDivide, srcO + 16 * tId * sizeDivide, number * 16); + } + MNN_CONCURRENCY_END(); + } + for (int i = remain; i < size; i++) { + dstO[i] = srcO[i] > 0 ? 
srcO[i] : 0; + } + return NO_ERROR; + } + const float* srcO = (const float*)ib.host; float* dstO = (float*)ob.host; auto size = inputs[0]->size() / sizeof(float); @@ -62,7 +86,7 @@ ErrorCode CPURelu6::onExecute(const std::vector& inputs, const std::vec if (tId == numberThread - 1) { number = sizeQuad - tId * sizeDivide; } - MNNAxByClampBroadcastC4(dstO + tId * sizeDivide * 4, srcO + tId * sizeDivide * 4, bias.data(), number, 0, 0, 1, mParam.data()); + MNNAxByClampBroadcastUnit(dstO + tId * sizeDivide * 4, srcO + tId * sizeDivide * 4, bias.data(), number, 0, 0, 1, mParam.data()); } MNN_CONCURRENCY_END(); MNNAxByClamp(dstO + remain, srcO + remain, srcO + remain, size - remain, 0, 0, 0, 1, mParam.data()); diff --git a/source/backend/cpu/CPUReluGrad.cpp b/source/backend/cpu/CPUReluGrad.cpp deleted file mode 100644 index 4d336346..00000000 --- a/source/backend/cpu/CPUReluGrad.cpp +++ /dev/null @@ -1,82 +0,0 @@ -// -// CPUReluGrad.cpp -// MNN -// -// Created by MNN on 2019/04/18. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "CPUReluGrad.hpp" -#include "core/Concurrency.h" -#include "CPUBackend.hpp" -namespace MNN { -ErrorCode CPUReluGrad::onExecute(const std::vector& inputs, const std::vector& outputs) { - MNN_ASSERT(0 == mSlope); - auto reluOrigin = inputs[0]; - auto reluDiff = inputs[1]; - auto outputDiff = outputs[0]; - auto size = outputDiff->elementSize(); - - auto reluOriginPtr = reluOrigin->host(); - auto reluDiffPtr = reluDiff->host(); - auto outputDiffPtr = outputDiff->host(); - auto numberThread = ((CPUBackend*)backend())->threadNumber(); - MNN_CONCURRENCY_BEGIN(tId, numberThread) { - for (int n = tId; n < size; n+=numberThread) { - if (reluOriginPtr[n] > 0.0f) { - outputDiffPtr[n] = reluDiffPtr[n]; - } else { - outputDiffPtr[n] = 0.0f; - } - } - } - MNN_CONCURRENCY_END(); - - return NO_ERROR; -} -class CPURelu6Grad : public Execution { -public: - CPURelu6Grad(Backend *bn) : Execution(bn) { - //Do nothing - } - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override { - auto reluOrigin = inputs[0]; - auto reluDiff = inputs[1]; - auto outputDiff = outputs[0]; - auto size = outputDiff->elementSize(); - - auto reluOriginPtr = reluOrigin->host(); - auto reluDiffPtr = reluDiff->host(); - auto outputDiffPtr = outputDiff->host(); - auto numberThread = ((CPUBackend*)backend())->threadNumber(); - MNN_CONCURRENCY_BEGIN(tId, numberThread) { - for (int n = tId; n < size; n+=numberThread) { - if (reluOriginPtr[n] > 0.0f && reluOriginPtr[n] <= 6.0f) { - outputDiffPtr[n] = reluDiffPtr[n]; - } else { - outputDiffPtr[n] = 0.0f; - } - } - } - MNN_CONCURRENCY_END(); - return NO_ERROR; - } -}; -class CPUReluGradCreator : public CPUBackend::Creator { -public: - virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, - const MNN::Op* op, Backend* backend) const override { - if (op->type() == OpType_ReluGrad) { - auto slope = op->main_as_Relu()->slope(); - return new CPUReluGrad(slope, backend); - } - if (op->type() == OpType_Relu6Grad) { - return new CPURelu6Grad(backend); - } - return nullptr; - } -}; - -REGISTER_CPU_OP_CREATOR(CPUReluGradCreator, OpType_ReluGrad); -REGISTER_CPU_OP_CREATOR(CPUReluGradCreator, OpType_Relu6Grad); -} // namespace MNN diff --git a/source/backend/cpu/CPUReluGrad.hpp b/source/backend/cpu/CPUReluGrad.hpp deleted file mode 100644 index 003ad6d6..00000000 --- a/source/backend/cpu/CPUReluGrad.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// -// CPUReluGrad.hpp -// MNN -// -// Created by MNN on 2019/04/18. 
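The int8 ReLU path added to CPURelu::onExecute above splits the flattened buffer into 16-element chunks, gives each thread an equal share of chunks with the last thread absorbing the leftovers, and finishes the sub-16 tail with a scalar loop. A standalone sketch of that partitioning follows; a serial loop stands in for MNN_CONCURRENCY_BEGIN/END and a plain clamp stands in for the vectorized MNNReluInt8:

#include <cstdint>
#include <cstdio>
#include <vector>

// Stand-in for the vectorized MNNReluInt8: clamp negatives to zero.
static void reluInt8Chunk(int8_t* dst, const int8_t* src, int count) {
    for (int i = 0; i < count; ++i) {
        dst[i] = src[i] > 0 ? src[i] : 0;
    }
}

// Same split as the diff: sizeQuad whole 16-element chunks divided across
// threads, the last thread takes the remainder, scalar tail past `remain`.
static void reluInt8(int8_t* dst, const int8_t* src, int size, int numberThread) {
    int sizeQuad   = size / 16;
    int remain     = sizeQuad * 16;
    int sizeDivide = sizeQuad / numberThread;
    if (sizeQuad > 0) {
        for (int tId = 0; tId < numberThread; ++tId) {  // serial stand-in for MNN_CONCURRENCY
            int number = (tId == numberThread - 1) ? sizeQuad - tId * sizeDivide : sizeDivide;
            reluInt8Chunk(dst + 16 * tId * sizeDivide, src + 16 * tId * sizeDivide, number * 16);
        }
    }
    for (int i = remain; i < size; ++i) {
        dst[i] = src[i] > 0 ? src[i] : 0;
    }
}

int main() {
    std::vector<int8_t> src(37), dst(37);
    for (int i = 0; i < 37; ++i) src[i] = static_cast<int8_t>(i - 18);
    reluInt8(dst.data(), src.data(), 37, 3);
    printf("dst[0]=%d dst[17]=%d dst[36]=%d\n", dst[0], dst[17], dst[36]); // 0 0 18
    return 0;
}

With 37 elements and 3 threads, sizeDivide is 0, so the last thread processes both 16-element chunks and the scalar loop covers indices 32 through 36, which matches the edge handling in the hunk above.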
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifndef CPUReluGrad_hpp -#define CPUReluGrad_hpp - -#include "backend/cpu/CPUBackend.hpp" -namespace MNN { -class CPUReluGrad : public Execution { -public: - CPUReluGrad(float slope, Backend *bn) : Execution(bn), mSlope(slope) { - } - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; - -private: - float mSlope = 0.0f; -}; -} // namespace MNN - -#endif /* CPUReluGrad_hpp */ diff --git a/source/backend/cpu/CPURuntime.cpp b/source/backend/cpu/CPURuntime.cpp index 37c518a7..3f5c23ca 100644 --- a/source/backend/cpu/CPURuntime.cpp +++ b/source/backend/cpu/CPURuntime.cpp @@ -15,7 +15,7 @@ #include #endif -#if defined(__aarch64__) && defined(ENABLE_ARMV82) +#if defined(ENABLE_ARMV82) && (defined(__ANDROID__) || defined(__aarch64__)) #ifdef __ANDROID__ #include @@ -274,7 +274,7 @@ float MNNGetCPUFlops(uint32_t number) { // cpuinfo // Reference from: https://github.com/pytorch/cpuinfo -#if defined(__aarch64__) && defined(ENABLE_ARMV82) +#if defined(ENABLE_ARMV82) && (defined(__ANDROID__) || defined(__aarch64__)) #ifdef __ANDROID__ @@ -299,9 +299,14 @@ float MNNGetCPUFlops(uint32_t number) { #define CPUINFO_ARM_MIDR_PART_OFFSET 4 #define CPUINFO_ARM_MIDR_REVISION_OFFSET 0 +#ifdef __aarch64__ #define CPUINFO_ARM_LINUX_FEATURE_FPHP UINT32_C(0x00000200) #define CPUINFO_ARM_LINUX_FEATURE_ASIMDHP UINT32_C(0x00000400) #define CPUINFO_ARM_LINUX_FEATURE_ASIMDDP UINT32_C(0x00100000) +#else +#define CPUINFO_ARM_LINUX_FEATURE_HALF UINT32_C(0x00000002) +#define CPUINFO_ARM_LINUX_FEATURE_NEON UINT32_C(0x00001000) +#endif struct cpuinfo_arm_linux_processor { uint32_t architecture_version; @@ -349,6 +354,10 @@ inline static uint32_t midr_set_variant(uint32_t midr, uint32_t variant) { ((variant << CPUINFO_ARM_MIDR_VARIANT_OFFSET) & CPUINFO_ARM_MIDR_VARIANT_MASK); } +inline static uint32_t midr_get_variant(uint32_t midr) { + return (midr & CPUINFO_ARM_MIDR_VARIANT_MASK) >> CPUINFO_ARM_MIDR_VARIANT_OFFSET; +} + uint32_t cpuinfo_arm_linux_hwcap_from_getauxval(void) { return (uint32_t)getauxval(AT_HWCAP); } @@ -1326,13 +1335,15 @@ void cpuinfo_arm_init(struct cpuinfo_arm_isa* cpuinfo_isa) { cpuinfo_isa->dot = true; break; default: +#ifdef __aarch64__ if (isa_features & CPUINFO_ARM_LINUX_FEATURE_ASIMDDP) { cpuinfo_isa->dot = true; } +#endif // TODO, whitelist, ex: hisilicon_kirin 980... break; } - +#ifdef __aarch64__ const uint32_t fp16arith_mask = CPUINFO_ARM_LINUX_FEATURE_FPHP | CPUINFO_ARM_LINUX_FEATURE_ASIMDHP; if ((isa_features & fp16arith_mask) == fp16arith_mask) { if (chipset.series == cpuinfo_arm_chipset_series_samsung_exynos && chipset.model == 9810) { @@ -1341,6 +1352,71 @@ void cpuinfo_arm_init(struct cpuinfo_arm_isa* cpuinfo_isa) { cpuinfo_isa->fp16arith = true; } } +#else + // pytorch/cpuinfo: src/arm/linux/aarch32-isa.c + uint32_t architecture_version = 0; + if (processors_count > 0) { + architecture_version = arm_linux_processors[0].architecture_version; + } + if (architecture_version >= 8) { + /* + * NEON FP16 compute extension and VQRDMLAH/VQRDMLSH instructions are not indicated in /proc/cpuinfo. 
+ * Use a MIDR-based heuristic to whitelist processors known to support it: + * - Processors with Cortex-A55 cores + * - Processors with Cortex-A65 cores + * - Processors with Cortex-A75 cores + * - Processors with Cortex-A76 cores + * - Processors with Cortex-A77 cores + * - Processors with Exynos M4 cores + * - Processors with Exynos M5 cores + * - Neoverse N1 cores + */ + if (chipset.series == cpuinfo_arm_chipset_series_samsung_exynos && chipset.model == 9810) { + /* Only little cores of Exynos 9810 support FP16 & RDM */ + MNN_PRINT("FP16 arithmetics and RDM disabled: only little cores in Exynos 9810 support these extensions"); + } else { + switch (last_midr & (CPUINFO_ARM_MIDR_IMPLEMENTER_MASK | CPUINFO_ARM_MIDR_PART_MASK)) { + case UINT32_C(0x4100D050): /* Cortex-A55 */ + case UINT32_C(0x4100D060): /* Cortex-A65 */ + case UINT32_C(0x4100D0B0): /* Cortex-A76 */ + case UINT32_C(0x4100D0C0): /* Neoverse N1 */ + case UINT32_C(0x4100D0D0): /* Cortex-A77 */ + case UINT32_C(0x4100D0E0): /* Cortex-A76AE */ + case UINT32_C(0x4800D400): /* Cortex-A76 (HiSilicon) */ + case UINT32_C(0x51008020): /* Kryo 385 Gold (Cortex-A75) */ + case UINT32_C(0x51008030): /* Kryo 385 Silver (Cortex-A55) */ + case UINT32_C(0x51008040): /* Kryo 485 Gold (Cortex-A76) */ + case UINT32_C(0x51008050): /* Kryo 485 Silver (Cortex-A55) */ + case UINT32_C(0x53000030): /* Exynos M4 */ + case UINT32_C(0x53000040): /* Exynos M5 */ + cpuinfo_isa->fp16arith = true; + break; + } + } + /* + * NEON VDOT instructions are not indicated in /proc/cpuinfo. + * Use a MIDR-based heuristic to whitelist processors known to support it. + */ + switch (last_midr & (CPUINFO_ARM_MIDR_IMPLEMENTER_MASK | CPUINFO_ARM_MIDR_PART_MASK)) { + case UINT32_C(0x4100D0B0): /* Cortex-A76 */ + case UINT32_C(0x4100D0D0): /* Cortex-A77 */ + case UINT32_C(0x4100D0E0): /* Cortex-A76AE */ + case UINT32_C(0x4800D400): /* Cortex-A76 (HiSilicon) */ + case UINT32_C(0x51008040): /* Kryo 485 Gold (Cortex-A76) */ + case UINT32_C(0x51008050): /* Kryo 485 Silver (Cortex-A55) */ + case UINT32_C(0x53000030): /* Exynos-M4 */ + case UINT32_C(0x53000040): /* Exynos-M5 */ + cpuinfo_isa->dot = true; + break; + case UINT32_C(0x4100D050): /* Cortex A55: revision 1 or later only */ + cpuinfo_isa->dot = (midr_get_variant(last_midr) >= 1); + break; + case UINT32_C(0x4100D0A0): /* Cortex A75: revision 2 or later only */ + cpuinfo_isa->dot = (midr_get_variant(last_midr) >= 2); + break; + } + } +#endif #endif // #ifdef __ANDROID__ diff --git a/source/backend/cpu/CPURuntime.hpp b/source/backend/cpu/CPURuntime.hpp index 0aece70b..2c2890b0 100644 --- a/source/backend/cpu/CPURuntime.hpp +++ b/source/backend/cpu/CPURuntime.hpp @@ -9,7 +9,7 @@ #ifndef CPURuntime_hpp #define CPURuntime_hpp -#if defined(__aarch64__) && defined(ENABLE_ARMV82) +#if defined(ENABLE_ARMV82) && (defined(__ANDROID__) || defined(__aarch64__)) struct cpuinfo_arm_isa { bool fp16arith; bool dot; @@ -131,7 +131,7 @@ int MNNSetCPUThreadsMode(MNNCPUThreadsMode mode); // float MNNGetCPUFlops(uint32_t number); -#if defined(__aarch64__) && defined(ENABLE_ARMV82) +#if defined(ENABLE_ARMV82) && (defined(__ANDROID__) || defined(__aarch64__)) void cpuinfo_arm_init(struct cpuinfo_arm_isa* cpuinfo_isa); diff --git a/source/backend/cpu/CPUScale.cpp b/source/backend/cpu/CPUScale.cpp index 14f68e93..99de954f 100644 --- a/source/backend/cpu/CPUScale.cpp +++ b/source/backend/cpu/CPUScale.cpp @@ -8,18 +8,19 @@ #include "CPUScale.hpp" #include "CPUBackend.hpp" -#include "compute/CommonOptFunction.h" #include "core/Macro.h" #include 
"core/TensorUtils.hpp" #include "core/Concurrency.h" +#include "compute/CommonOptFunction.h" namespace MNN { CPUScale::CPUScale(const Op* op, Backend* bn) : MNN::Execution(bn) { auto scale = op->main_as_Scale(); int outputCount = scale->scaleData()->size(); + auto core = static_cast(bn)->functions(); mScaleBias.reset( - Tensor::createDevice( - {2, ALIGN_UP4(outputCount)} + Tensor::createDevice( + {2, UP_DIV(outputCount, core->pack) * core->pack * core->bytes} )); auto res = bn->onAcquireBuffer(mScaleBias.get(), Backend::STATIC); if (!res) { @@ -29,9 +30,17 @@ CPUScale::CPUScale(const Op* op, Backend* bn) : MNN::Execution(bn) { return; } ::memset(mScaleBias->host(), 0, mScaleBias->size()); - ::memcpy(mScaleBias->host(), scale->scaleData()->data(), outputCount * sizeof(float)); + if (core->bytes < 4) { + core->MNNFp32ToLowp(scale->scaleData()->data(), mScaleBias->host(), outputCount); + } else { + ::memcpy(mScaleBias->host(), scale->scaleData()->data(), outputCount * sizeof(float)); + } if (nullptr != scale->biasData() && nullptr != scale->biasData()->data()) { - ::memcpy(mScaleBias->host() + ALIGN_UP4(outputCount), scale->biasData()->data(), outputCount * sizeof(float)); + if (core->bytes < 4) { + core->MNNFp32ToLowp(scale->biasData()->data(), (int16_t*)(mScaleBias->host() + 1 * mScaleBias->length(1)), outputCount); + } else { + ::memcpy(mScaleBias->host() + ALIGN_UP4(outputCount), scale->biasData()->data(), outputCount * sizeof(float)); + } } } CPUScale::~CPUScale() { @@ -42,35 +51,27 @@ CPUScale::~CPUScale() { ErrorCode CPUScale::onExecute(const std::vector& inputs, const std::vector& outputs) { auto input = inputs[0]; auto output = outputs[0]; - auto scalePtr = mScaleBias->host(); - auto biasPtr = mScaleBias->host() + 1 * mScaleBias->length(1); + auto core = static_cast(backend())->functions(); + auto scalePtr = mScaleBias->host(); + auto biasPtr = mScaleBias->host() + 1 * mScaleBias->length(1); //FUNC_PRINT(TensorUtils::getDescribe(input)->dimensionFormat); - if (TensorUtils::getDescribe(input)->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) { - auto batch = input->buffer().dim[0].extent; - auto depthQuad = UP_DIV(input->channel(), 4); - int planeNumber = 1; - for (int i = 2; i < input->buffer().dimensions; ++i) { - planeNumber *= input->length(i); - } - auto depthStride = planeNumber * 4; - auto totalDepth = batch * depthQuad; - int numberThread = ((CPUBackend*)backend())->threadNumber(); - MNN_CONCURRENCY_BEGIN(tId, numberThread) { - for (int i = tId; i < totalDepth; i+=numberThread) { - auto depthIndex = i % depthQuad; - MNNScaleAndAddBias(output->host() + depthStride * i, input->host() + depthStride * i, biasPtr + 4 * depthIndex, - scalePtr + 4 * depthIndex, planeNumber, 1); - } - } - MNN_CONCURRENCY_END(); - return NO_ERROR; + auto batch = input->buffer().dim[0].extent; + auto depthQuad = UP_DIV(input->channel(), core->pack); + int planeNumber = 1; + for (int i = 2; i < input->buffer().dimensions; ++i) { + planeNumber *= input->length(i); } - MNN_ASSERT(TensorUtils::getDescribe(input)->dimensionFormat == MNN_DATA_FORMAT_NHWC); - - auto channel = input->channel(); - auto outside = input->elementSize() / channel; - MNNScaleAndAddBiasOutside(output->host(), input->host(), biasPtr, scalePtr, outside, channel); - + auto depthStride = planeNumber * core->pack; + auto totalDepth = batch * depthQuad; + int numberThread = ((CPUBackend*)backend())->threadNumber(); + MNN_CONCURRENCY_BEGIN(tId, numberThread) { + for (int i = tId; i < totalDepth; i+=numberThread) { + auto depthIndex = i % 
depthQuad; + core->MNNScaleAndAddBias((float*)(output->host() + depthStride * i * core->bytes), (const float*)(input->host() + depthStride * i * core->bytes), (const float*)(biasPtr + core->pack * core->bytes * depthIndex), + (const float*)(scalePtr + core->pack * core->bytes * depthIndex), planeNumber, 1); + } + } + MNN_CONCURRENCY_END(); return NO_ERROR; } class CPUScaleCreator : public CPUBackend::Creator { diff --git a/source/backend/cpu/CPUShape.cpp b/source/backend/cpu/CPUShape.cpp deleted file mode 100644 index 8db33e51..00000000 --- a/source/backend/cpu/CPUShape.cpp +++ /dev/null @@ -1,41 +0,0 @@ -// -// CPUShape.cpp -// MNN -// -// Created by MNN on 2018/08/15. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "backend/cpu/CPUShape.hpp" -#include "backend/cpu/CPUBackend.hpp" -#include "core/Macro.h" -#include "core/TensorUtils.hpp" -namespace MNN { - -ErrorCode CPUShape::onExecute(const std::vector& inputs, const std::vector& outputs) { - auto& ib = inputs[0]->buffer(); - int32_t* outData = outputs[0]->host(); - auto inputFormat = TensorUtils::getDescribe(inputs[0])->dimensionFormat; - if ((inputFormat == MNN_DATA_FORMAT_NC4HW4) && TensorUtils::getDescribe(outputs[0])->dimensionFormat == MNN_DATA_FORMAT_NHWC) { - outData[0] = ib.dim[0].extent; - outData[1] = ib.dim[2].extent; - outData[2] = ib.dim[3].extent; - outData[3] = ib.dim[1].extent; - } else { - for (int i = 0; i < ib.dimensions; i++) { - outData[i] = ib.dim[i].extent; - } - } - return NO_ERROR; -} - -class CPUShapeCreator : public CPUBackend::Creator { -public: - virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, - const MNN::Op* op, Backend* backend) const override { - return new CPUShape(backend); - } -}; - -REGISTER_CPU_OP_CREATOR(CPUShapeCreator, OpType_Shape); -} // namespace MNN diff --git a/source/backend/cpu/CPUShape.hpp b/source/backend/cpu/CPUShape.hpp deleted file mode 100644 index 8ef8be94..00000000 --- a/source/backend/cpu/CPUShape.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// -// CPUShape.hpp -// MNN -// -// Created by MNN on 2018/08/15. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifndef CPUShape_hpp -#define CPUShape_hpp - -#include "core/Execution.hpp" - -namespace MNN { -class CPUShape : public Execution { -public: - CPUShape(Backend *b) : Execution(b) { - // nothing to do - } - virtual ~CPUShape() = default; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; -}; -} // namespace MNN - -#endif /* CPUShape_hpp */ diff --git a/source/backend/cpu/CPUSigmoid.cpp b/source/backend/cpu/CPUSigmoid.cpp deleted file mode 100644 index 7a85f176..00000000 --- a/source/backend/cpu/CPUSigmoid.cpp +++ /dev/null @@ -1,39 +0,0 @@ -// -// CPUSigmoid.cpp -// MNN -// -// Created by MNN on 2018/08/09. 
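In the rewritten CPUScale above, the scale/bias staging buffer is no longer hard-coded to 4-float packing; it is sized from the backend's CoreFunctions, with row 0 holding the (possibly low-precision-converted) scale and row 1 the bias, each padded to a whole number of packs. A minimal addressing sketch, taking pack = 8 and bytes = 2 purely for illustration (the real values come from core->pack and core->bytes; the host<>() template arguments were stripped in this view, so byte addressing is assumed):

    int rowBytes = UP_DIV(outputCount, core->pack) * core->pack * core->bytes; // == mScaleBias->length(1)
    uint8_t* scalePtr = mScaleBias->host<uint8_t>();   // row 0: scale
    uint8_t* biasPtr  = scalePtr + rowBytes;           // row 1: bias
    // onExecute then steps core->pack * core->bytes * depthIndex into each row.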
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "backend/cpu/CPUSigmoid.hpp" -#include -#include "backend/cpu/CPUBackend.hpp" -#include "backend/cpu/compute/CommonOptFunction.h" -#include "core/Macro.h" - -namespace MNN { -ErrorCode CPUSigmoid::onExecute(const std::vector& inputs, const std::vector& outputs) { - MNN_ASSERT(1 == inputs.size()); - MNN_ASSERT(1 == outputs.size()); - auto inputData = inputs[0]->host(); - auto outputData = outputs[0]->host(); - - const int dataSize = outputs[0]->elementSize(); - MNNExp(outputData, inputData, dataSize); - for (int i = 0; i < dataSize; ++i) { - outputData[i] = 1.0f / (1.0f + outputData[i]); - } - return NO_ERROR; -} - -class CPUSigmoidCreator : public CPUBackend::Creator { -public: - virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, - const MNN::Op* op, Backend* backend) const { - return new CPUSigmoid(backend); - } -}; - -REGISTER_CPU_OP_CREATOR(CPUSigmoidCreator, OpType_Sigmoid); -} // namespace MNN diff --git a/source/backend/cpu/CPUSigmoid.hpp b/source/backend/cpu/CPUSigmoid.hpp deleted file mode 100644 index f5924cea..00000000 --- a/source/backend/cpu/CPUSigmoid.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// -// CPUSigmoid.hpp -// MNN -// -// Created by MNN on 2018/08/09. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifndef CPUSigmoid_hpp -#define CPUSigmoid_hpp - -#include "core/Execution.hpp" - -namespace MNN { -class CPUSigmoid : public Execution { -public: - CPUSigmoid(Backend *b) : Execution(b) { - // nothing to do - } - virtual ~CPUSigmoid() = default; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; -}; -} // namespace MNN - -#endif /* CPUSigmoid_hpp */ diff --git a/source/backend/cpu/CPUSize.cpp b/source/backend/cpu/CPUSize.cpp deleted file mode 100644 index c88addec..00000000 --- a/source/backend/cpu/CPUSize.cpp +++ /dev/null @@ -1,38 +0,0 @@ -// -// CPUSize.cpp -// MNN -// -// Created by MNN on 2018/08/23. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "backend/cpu/CPUSize.hpp" -#include "backend/cpu/CPUBackend.hpp" - -namespace MNN { - -template -CPUSize::CPUSize(Backend *backend, const Op *op) : Execution(backend) { - // nothing to do -} - -template -ErrorCode CPUSize::onExecute(const std::vector &inputs, const std::vector &outputs) { - int count = 1; - for (int i = 0; i < inputs[0]->buffer().dimensions; i++) { - count *= inputs[0]->buffer().dim[i].extent; - } - outputs[0]->host()[0] = count; - return NO_ERROR; -} - -class CPUSizeCreator : public CPUBackend::Creator { -public: - virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, - const MNN::Op *op, Backend *backend) const { - return new CPUSize(backend, op); - } -}; - -REGISTER_CPU_OP_CREATOR(CPUSizeCreator, OpType_Size); -} // namespace MNN diff --git a/source/backend/cpu/CPUSize.hpp b/source/backend/cpu/CPUSize.hpp deleted file mode 100644 index 4a42b5cd..00000000 --- a/source/backend/cpu/CPUSize.hpp +++ /dev/null @@ -1,24 +0,0 @@ -// -// CPUSize.hpp -// MNN -// -// Created by MNN on 2018/08/23. 
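CPUShape, CPUSigmoid, CPUSize and CPUTanh are all retired as dedicated executions in this patch. For sigmoid and tanh, the CPUUnary hunk further down takes over; its float path selects the kernel per operation and, for sigmoid, per precision mode. Condensed from that hunk:

    case UnaryOpOperation_SIGMOID:
        if (BackendConfig::Precision_Low == precision) {
            MNNSigmoidLowp(out, inp, realSize);
        } else {
            MNNSigmoid(out, inp, realSize);
        }
        break;
    case UnaryOpOperation_TANH:
        MNNTanh(out, inp, realSize);
        break;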
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifndef CPUSize_hpp -#define CPUSize_hpp - -#include "core/Execution.hpp" - -namespace MNN { -template -class CPUSize : public Execution { -public: - CPUSize(Backend *backend, const Op *op); - virtual ~CPUSize() = default; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; -}; - -} // namespace MNN -#endif /* CPUSize_hpp */ diff --git a/source/backend/cpu/CPUSoftmaxGrad.cpp b/source/backend/cpu/CPUSoftmaxGrad.cpp deleted file mode 100644 index 7f9bb11a..00000000 --- a/source/backend/cpu/CPUSoftmaxGrad.cpp +++ /dev/null @@ -1,88 +0,0 @@ -// -// CPUSoftmaxGrad.cpp -// MNN -// -// Created by MNN on 2019/04/18. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "backend/cpu/CPUSoftmaxGrad.hpp" -#include "backend/cpu/compute/CommonOptFunction.h" -#include "backend/cpu/compute/ConvOpt.h" -#include "core/Macro.h" -#include "core/TensorUtils.hpp" -#include "math/Vec.hpp" -using Vec4 = MNN::Math::Vec; -namespace MNN { -ErrorCode CPUSoftmaxGrad::onExecute(const std::vector& inputs, const std::vector& outputs) { - MNN_ASSERT(1 == mAxis); - auto softmax = inputs[0]; - auto gradSoftmax = inputs[1]; - auto gradX = outputs[0]; - auto gradXPtr = gradX->host(); - auto softmaxPtr = softmax->host(); - auto gradSoftmaxPtr = gradSoftmax->host(); - auto batch = softmax->length(0); - if (TensorUtils::getDescribe(gradX)->dimensionFormat == MNN_DATA_FORMAT_NHWC || TensorUtils::getDescribe(gradX)->dimensionFormat == MNN_DATA_FORMAT_NCHW) { - // NHWC - auto channel = softmax->length(1); - MNN_ASSERT(channel > 0); - for (int i = 0; i < batch; ++i) { - auto s0 = softmaxPtr + i * channel; - auto s1 = gradSoftmaxPtr + i * channel; - - auto dst = gradXPtr + i * channel; - float sumV = 0.0f; - for (int j = 0; j < channel; ++j) { - sumV = sumV + s1[j] * s0[j]; - } - for (int j = 0; j < channel; ++j) { - dst[j] = s0[j] * (s1[j] - sumV); - } - } - return NO_ERROR; - } - auto channel = softmax->channel(); - auto channelC4 = channel / 4; - auto channelAlign = ALIGN_UP4(channel); - auto channelRemain = channelC4 * 4; - - for (int i = 0; i < batch; ++i) { - auto s0 = softmaxPtr + i * channelAlign; - auto s1 = gradSoftmaxPtr + i * channelAlign; - - auto dst = gradXPtr + i * channelAlign; - ::memset(dst, 0, channelAlign * sizeof(float)); - Vec4 sumV(0.0f); - for (int j = 0; j < channelC4; ++j) { - sumV = sumV + Vec4::load(s1 + 4 * j) * Vec4::load(s0 + 4 * j); - } - float sum = sumV[0] + sumV[1] + sumV[2] + sumV[3]; - for (int j = channelRemain; j < channel; ++j) { - sum += s1[j] * s0[j]; - } - sumV = Vec4(sum); - for (int j = 0; j < channelC4; ++j) { - Vec4::save(dst + 4 * j, Vec4::load(s0 + 4 * j) * (Vec4::load(s1 + 4 * j) - sumV)); - } - for (int j = channelRemain; j < channel; ++j) { - dst[j] = s0[j] * (s1[j] - sum); - } - } - return NO_ERROR; -} -class CPUSoftmaxGradCreator : public CPUBackend::Creator { -public: - virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, - const MNN::Op* op, Backend* backend) const override { - auto axis = op->main_as_Axis()->axis(); - if (axis < 0) { - axis = inputs[0]->dimensions() + axis; - } - return new CPUSoftmaxGrad(axis, backend); - } -}; - -REGISTER_CPU_OP_CREATOR(CPUSoftmaxGradCreator, OpType_SoftmaxGrad); - -} // namespace MNN diff --git a/source/backend/cpu/CPUSoftmaxGrad.hpp b/source/backend/cpu/CPUSoftmaxGrad.hpp deleted file mode 100644 index e3f9ea4b..00000000 --- a/source/backend/cpu/CPUSoftmaxGrad.hpp +++ /dev/null @@ -1,26 +0,0 @@ 
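The CPUSoftmaxGrad implementation deleted above encodes the usual softmax backward rule: with softmax output s and incoming gradient g, it forms sum = Σ_j g_j * s_j per sample and writes dx_j = s_j * (g_j - sum), vectorized with Vec4 lanes for NC4HW4 and done scalar for NHWC/NCHW. A scalar sketch of the same rule:

    float sum = 0.0f;
    for (int j = 0; j < channel; ++j) sum += g[j] * s[j];
    for (int j = 0; j < channel; ++j) dx[j] = s[j] * (g[j] - sum);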
-// -// CPUSoftmaxGrad.hpp -// MNN -// -// Created by MNN on 2019/04/18. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifndef CPUSoftmaxGrad_hpp -#define CPUSoftmaxGrad_hpp - -#include "backend/cpu/CPUBackend.hpp" - -namespace MNN { -class CPUSoftmaxGrad : public Execution { -public: - CPUSoftmaxGrad(int axis, Backend *bn) : Execution(bn), mAxis(axis) { - } - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; - -private: - int mAxis = 1; -}; -} // namespace MNN - -#endif /* CPUSoftmaxGrad_hpp */ diff --git a/source/backend/cpu/CPUTanh.cpp b/source/backend/cpu/CPUTanh.cpp deleted file mode 100644 index dd8e8424..00000000 --- a/source/backend/cpu/CPUTanh.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// -// CPUTanh.cpp -// MNN -// -// Created by MNN on 2018/08/27. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "backend/cpu/CPUTanh.hpp" -#include -#include "backend/cpu/compute/CommonOptFunction.h" -#include "backend/cpu/CPUBackend.hpp" -#include "core/Macro.h" - -namespace MNN { - -ErrorCode CPUTanh::onExecute(const std::vector &inputs, const std::vector &outputs) { - MNN_ASSERT(1 == inputs.size()); - MNN_ASSERT(1 == outputs.size()); - auto inputData = inputs[0]->host(); - auto outputData = outputs[0]->host(); - - const int dataSize = outputs[0]->elementSize(); - MNNTanh(outputData, inputData, dataSize); - return NO_ERROR; -} -} // namespace MNN diff --git a/source/backend/cpu/CPUTanh.hpp b/source/backend/cpu/CPUTanh.hpp deleted file mode 100644 index b57bcd29..00000000 --- a/source/backend/cpu/CPUTanh.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// -// CPUTanh.hpp -// MNN -// -// Created by MNN on 2018/08/27. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifndef CPUTanh_hpp -#define CPUTanh_hpp - -#include "core/Execution.hpp" - -namespace MNN { -class CPUTanh : public Execution { -public: - CPUTanh(Backend *b) : Execution(b) { - // nothing to do - } - virtual ~CPUTanh() = default; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; -}; -} // namespace MNN - -#endif // CPUTanh_hpp diff --git a/source/backend/cpu/CPUTensorConvert.cpp b/source/backend/cpu/CPUTensorConvert.cpp index d80b4e35..1f2af143 100644 --- a/source/backend/cpu/CPUTensorConvert.cpp +++ b/source/backend/cpu/CPUTensorConvert.cpp @@ -25,6 +25,16 @@ static void _NC4HW42NHWCUint8(const uint8_t* source, uint8_t* dest, int b, int c } } +static void _NC4HW42NHWCInt16(const int16_t* source, int16_t* dest, int b, int c, int area) { + int sourceBatchsize = ALIGN_UP4(c) * area; + int destBatchSize = c * area; + for (int bi = 0; bi < b; ++bi) { + auto srcBatch = source + bi * sourceBatchsize; + auto dstBatch = dest + bi * destBatchSize; + MNNPackTransposeInt16(dstBatch, srcBatch, area, c); + } +} + static void _NHWC2NC4HW4Uint8(const uint8_t* source, uint8_t* dest, int b, int c, int area) { int sourceBatchsize = c * area; int destBatchSize = ALIGN_UP4(c) * area; @@ -34,8 +44,17 @@ static void _NHWC2NC4HW4Uint8(const uint8_t* source, uint8_t* dest, int b, int c MNNUnpackTransposeUint8(dstBatch, srcBatch, area, c); } } +static void _NHWC2NC4HW4Int16(const int16_t* source, int16_t* dest, int b, int c, int area) { + int sourceBatchsize = c * area; + int destBatchSize = ALIGN_UP4(c) * area; + for (int bi = 0; bi < b; ++bi) { + auto srcBatch = source + bi * sourceBatchsize; + auto dstBatch = dest + bi * destBatchSize; + MNNUnpackTransposeInt16(dstBatch, srcBatch, area, c); + } +} -void CPUTensorConverter::NC4HW42NHWC(const float* 
source, float* dest, int b, int c, int area) { +static void NC4HW42NHWC(const float* source, float* dest, int b, int c, int area) { int sourceBatchsize = ALIGN_UP4(c) * area; int destBatchSize = c * area; for (int bi = 0; bi < b; ++bi) { @@ -45,7 +64,7 @@ void CPUTensorConverter::NC4HW42NHWC(const float* source, float* dest, int b, in } } -void CPUTensorConverter::NHWC2NC4HW4(const float* source, float* dest, int b, int c, int area) { +static void NHWC2NC4HW4(const float* source, float* dest, int b, int c, int area) { int sourceBatchsize = c * area; int destBatchSize = ALIGN_UP4(c) * area; for (int bi = 0; bi < b; ++bi) { @@ -55,7 +74,8 @@ void CPUTensorConverter::NHWC2NC4HW4(const float* source, float* dest, int b, in } } -void CPUTensorConverter::NCHW2NHWC(const float* source, float* dest, int b, int c, int area) { +template +void NCHW2NHWC(const T* source, T* dest, int b, int c, int area) { int sourceBatchsize = c * area; int destBatchSize = sourceBatchsize; for (int bi = 0; bi < b; ++bi) { @@ -71,7 +91,8 @@ void CPUTensorConverter::NCHW2NHWC(const float* source, float* dest, int b, int } } -void CPUTensorConverter::NHWC2NCHW(const float* source, float* dest, int b, int c, int area) { +template +void NHWC2NCHW(const T* source, T* dest, int b, int c, int area) { int sourceBatchsize = c * area; int destBatchSize = sourceBatchsize; for (int bi = 0; bi < b; ++bi) { @@ -91,6 +112,13 @@ ErrorCode CPUTensorConverter::convert(const void* inputRaw, void* outputRaw, MNN auto channelC4 = UP_DIV(channel, 4); auto batchStrideC4 = channelC4 * area * 4; auto batchStride = area * channel; + + // the case when source and dest data layout are the same + // This case occurs in BackendTest of BF16 data. + if(source == dest) { + ::memcpy(outputRaw, inputRaw, batch * area * channel * bitLength); + return NO_ERROR; + } if (MNN_DATA_FORMAT_NC4HW4 == source && MNN_DATA_FORMAT_NCHW == dest) { if (bitLength == 1) { for (int i = 0; i < batch; ++i) { @@ -99,8 +127,12 @@ ErrorCode CPUTensorConverter::convert(const void* inputRaw, void* outputRaw, MNN } return NO_ERROR; } - if (bitLength != 4) { - return INVALID_VALUE; + if (bitLength == 2) { + for (int i = 0; i < batch; ++i) { + MNNUnpackC4Int16((int16_t*)outputRaw + batchStride * i, + (const int16_t*)inputRaw + batchStrideC4 * i, area, channel); + } + return NO_ERROR; } for (int i = 0; i < batch; ++i) { MNNUnpackC4((float*)outputRaw + batchStride * i, (const float*)inputRaw + batchStrideC4 * i, area, channel); @@ -115,8 +147,11 @@ ErrorCode CPUTensorConverter::convert(const void* inputRaw, void* outputRaw, MNN } return NO_ERROR; } - if (bitLength != 4) { - return INVALID_VALUE; + if (bitLength == 2) { + for (int i = 0; i < batch; ++i) { + MNNPackC4Int16((int16_t*)outputRaw + batchStrideC4 * i, (const int16_t*)inputRaw + batchStride * i, area, channel); + } + return NO_ERROR; } for (int i = 0; i < batch; ++i) { MNNPackC4((float*)outputRaw + batchStrideC4 * i, (const float*)inputRaw + batchStride * i, area, channel); @@ -127,32 +162,54 @@ ErrorCode CPUTensorConverter::convert(const void* inputRaw, void* outputRaw, MNN if (MNN_DATA_FORMAT_NHWC == source && MNN_DATA_FORMAT_NC4HW4 == dest) { if (bitLength == 1) { _NHWC2NC4HW4Uint8((uint8_t*)inputRaw, (uint8_t*)outputRaw, batch, channel, area); + } else if (bitLength == 2){ + _NHWC2NC4HW4Int16((int16_t*)inputRaw, (int16_t*)outputRaw, batch, channel, area); } else { NHWC2NC4HW4((float*)inputRaw, (float*)outputRaw, batch, channel, area); } } else if (MNN_DATA_FORMAT_NC4HW4 == source && MNN_DATA_FORMAT_NHWC == dest) { if 
(bitLength == 1) { _NC4HW42NHWCUint8((uint8_t*)inputRaw, (uint8_t*)outputRaw, batch, channel, area); + } else if (bitLength == 2){ + _NC4HW42NHWCInt16((int16_t*)inputRaw, (int16_t*)outputRaw, batch, channel, area); } else { NC4HW42NHWC((float*)inputRaw, (float*)outputRaw, batch, channel, area); } } else if (MNN_DATA_FORMAT_NHWC == source && MNN_DATA_FORMAT_NCHW == dest) { - if (bitLength != 4) { - return NOT_SUPPORT; + switch (bitLength) { + case 1: + NHWC2NCHW((int8_t*)inputRaw, (int8_t*)outputRaw, batch, channel, area); + break; + case 2: + NHWC2NCHW((int16_t*)inputRaw, (int16_t*)outputRaw, batch, channel, area); + break; + case 4: + NHWC2NCHW((float*)inputRaw, (float*)outputRaw, batch, channel, area); + break; + default: + break; } - NHWC2NCHW((float*)inputRaw, (float*)outputRaw, batch, channel, area); } else if (MNN_DATA_FORMAT_NCHW == source && MNN_DATA_FORMAT_NHWC == dest) { - if (bitLength != 4) { - return NOT_SUPPORT; + switch (bitLength) { + case 1: + NCHW2NHWC((int8_t*)inputRaw, (int8_t*)outputRaw, batch, channel, area); + break; + case 2: + NCHW2NHWC((int16_t*)inputRaw, (int16_t*)outputRaw, batch, channel, area); + break; + case 4: + NCHW2NHWC((float*)inputRaw, (float*)outputRaw, batch, channel, area); + break; + default: + break; } - NCHW2NHWC((float*)inputRaw, (float*)outputRaw, batch, channel, area); } else { return NOT_SUPPORT; } return NO_ERROR; } -static std::tuple _splitDimensions(const halide_buffer_t& ib, MNN_DATA_FORMAT source) { +std::tuple CPUTensorConverter::splitDimensions(const halide_buffer_t& ib, MNN_DATA_FORMAT source) { int area = 1, batch = ib.dim[0].extent, channel; if (source == MNN_DATA_FORMAT_NC4HW4 || source == MNN_DATA_FORMAT_NCHW) { channel = ib.dim[1].extent; @@ -180,7 +237,7 @@ ErrorCode CPUTensorConverter::convert(const Tensor* input, const Tensor* output) MNN_ERROR("unknown data format!\nsrc: %s, dst: %s\n", EnumNameMNN_DATA_FORMAT(source), EnumNameMNN_DATA_FORMAT(dest)); return INVALID_VALUE; } - auto tup = _splitDimensions(ib, source); + auto tup = splitDimensions(ib, source); int area = std::get<1>(tup), batch = std::get<0>(tup), channel = std::get<2>(tup); const int bitLength = ib.type.bytes(); auto code = convert(ib.host, ob.host, source, dest, batch, area, channel, bitLength); @@ -206,7 +263,7 @@ ErrorCode CPUTensorConverter::onExecute(const std::vector& inputs, cons MNN_ERROR("unknown data format!\nsrc: %s, dst: %s\n", EnumNameMNN_DATA_FORMAT(source), EnumNameMNN_DATA_FORMAT(dest)); return INVALID_VALUE; } - auto tup = _splitDimensions(ib, source); + auto tup = splitDimensions(ib, source); int area = std::get<1>(tup), batch = std::get<0>(tup), channel = std::get<2>(tup); const int bitLength = ib.type.bytes(); diff --git a/source/backend/cpu/CPUTensorConvert.hpp b/source/backend/cpu/CPUTensorConvert.hpp index cdf802f0..c5b9243b 100644 --- a/source/backend/cpu/CPUTensorConvert.hpp +++ b/source/backend/cpu/CPUTensorConvert.hpp @@ -20,12 +20,7 @@ public: // Do nothing } virtual ~CPUTensorConverter() = default; - - static void NHWC2NC4HW4(const float* source, float* dest, int b, int c, int area); - static void NC4HW42NHWC(const float* dest, float* source, int b, int c, int area); - static void NHWC2NCHW(const float* dest, float* source, int b, int c, int area); - static void NCHW2NHWC(const float* source, float* dest, int b, int c, int area); - + static std::tuple splitDimensions(const halide_buffer_t& ib, MNN_DATA_FORMAT source); static ErrorCode convert(const Tensor* input, const Tensor* output); static ErrorCode convert(const void* inputRaw, 
void* outputRaw, MNN_DATA_FORMAT inputFormat, MNN_DATA_FORMAT outputFormat, int batch, int area, int channel, int bytes); virtual ErrorCode onExecute(const std::vector& inputs, const std::vector& outputs) override; diff --git a/source/backend/cpu/CPUUnary.cpp b/source/backend/cpu/CPUUnary.cpp index cf93f8ab..ce61888c 100644 --- a/source/backend/cpu/CPUUnary.cpp +++ b/source/backend/cpu/CPUUnary.cpp @@ -16,8 +16,6 @@ #include #include #include -#include "CPUTanh.hpp" -#include "CPUSigmoid.hpp" namespace MNN { CPUUnary::CPUUnary(Backend *b, UnaryOpOperation type) : MNN::Execution(b), mType(type) { @@ -31,21 +29,13 @@ ErrorCode CPUUnary::onResize(const std::vector &inputs, const std::vec } template -static ErrorCode _unaryOp(void* inputPtr, void* outputPtr, int elementSize, Backend* bn) { +static void _unaryOp(void* inputPtr, void* outputPtr, int elementSize) { Func f; - auto backend = [bn]() { - return bn; - }; const T *inputData = (T*)inputPtr; T *outputData = (T *)outputPtr; - auto numberThread = ((CPUBackend*)bn)->threadNumber(); - MNN_CONCURRENCY_BEGIN(tId, numberThread) { - for (int i=tId; i @@ -363,13 +353,17 @@ ErrorCode CPUUnary::onExecute(const std::vector &inputs, const std::ve if (dtype == halide_type_int) { switch (mType) { case UnaryOpOperation_ABS: - return _unaryOp, int32_t>(input->host(), output->host(), input->elementSize(), backend()); + _unaryOp, int32_t>(input->host(), output->host(), input->elementSize()); + break; case UnaryOpOperation_NEG: - return _unaryOp, int32_t>(input->host(), output->host(), input->elementSize(), backend()); + _unaryOp, int32_t>(input->host(), output->host(), input->elementSize()); + break; case UnaryOpOperation_SQUARE: - return _unaryOp, int32_t>(input->host(), output->host(), input->elementSize(), backend()); + _unaryOp, int32_t>(input->host(), output->host(), input->elementSize()); + break; case UnaryOpOperation_SIGN: - return _unaryOp, int32_t>(input->host(), output->host(), input->elementSize(), backend()); + _unaryOp, int32_t>(input->host(), output->host(), input->elementSize()); + break; default: MNN_ERROR("Int-Unary not support %d\n", mType); break; @@ -380,105 +374,126 @@ ErrorCode CPUUnary::onExecute(const std::vector &inputs, const std::ve auto schedule = ((CPUBackend*)backend())->multiThreadDivide(size); auto inputPtr = input->host(); auto outputPtr = output->host(); - switch (mType) { - case UnaryOpOperation_ABS: { - MNN_CONCURRENCY_BEGIN(tId, schedule.second) { - int start = schedule.first * (int)tId; - int realSize = schedule.first; - if (tId == schedule.second -1 ) { - realSize = size - start; - } - if (realSize > 0) { - MNNReluWithSlopeCommon(outputPtr + start, inputPtr + start, realSize, -1.0f); - } - } - MNN_CONCURRENCY_END(); - return NO_ERROR; + auto precision = static_cast(backend())->precisionMode(); + MNN_CONCURRENCY_BEGIN(tId, schedule.second) { + int start = schedule.first * (int)tId; + int realSize = schedule.first; + if (tId == schedule.second -1 ) { + realSize = size - start; } - case UnaryOpOperation_SQUARE: { - MNN_CONCURRENCY_BEGIN(tId, schedule.second) { - int start = schedule.first * (int)tId; - int realSize = schedule.first; - if (tId == schedule.second -1 ) { - realSize = size - start; - } - if (realSize > 0) { - MNNMatrixProdCommon(outputPtr + start, inputPtr + start, inputPtr + start, realSize, 0, 0, 0, 1); - } + if (realSize > 0) { + auto inp = inputPtr + start; + auto out = outputPtr + start; + switch (mType) { + case UnaryOpOperation_ABS: + MNNReluWithSlopeCommon(out, inp, realSize, -1.0f); + break; + case 
UnaryOpOperation_SQUARE: + MNNMatrixProdCommon(out, inp, inp, realSize, 0, 0, 0, 1); + break; + case UnaryOpOperation_NEG: + MNNScaleAndAddBiasScalar(out, inp, 0.0f, -1.0f, realSize); + break; + case UnaryOpOperation_RSQRT: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_EXP: + MNNScaleAndAddBiasScalar(out, inp, 0.0f, -1.0f, realSize); + MNNExp(out, out, realSize); + break; + case UnaryOpOperation_COS: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_SIN: + MNNSin(out, inp, realSize); + break; + case UnaryOpOperation_SIGMOID: + if (BackendConfig::Precision_Low == precision) { + MNNSigmoidLowp(out, inp, realSize); + } else { + MNNSigmoid(out, inp, realSize); + } + break; + case UnaryOpOperation_TANH: + MNNTanh(out, inp, realSize); + break; + case UnaryOpOperation_TAN: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_ATAN: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_SQRT: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_CEIL: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_RECIPROCAL: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_LOG1P: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_LOG: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_FLOOR: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_BNLL: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_ACOSH: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_SINH: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_ASINH: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_ATANH: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_SIGN: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_ROUND: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_COSH: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_ERF: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_ERFC: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_ERFINV: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_EXPM1: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_ASIN: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_ACOS: + _unaryOp, float>(inp, out, realSize); + break; + case UnaryOpOperation_HARDSWISH: + MNNHardSwishCommon(out, inp, realSize); + break; + default: + MNN_ASSERT(false); + break; } - MNN_CONCURRENCY_END(); - return NO_ERROR; } - case UnaryOpOperation_RSQRT: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_NEG: { - MNN_CONCURRENCY_BEGIN(tId, schedule.second) { - int start = schedule.first * (int)tId; - int realSize = schedule.first; - if (tId == schedule.second -1 ) { - realSize = size - start; - } - if (realSize > 0) { - MNNScaleAndAddBiasScalar(outputPtr + start, inputPtr + start, 0.0f, -1.0f, realSize); - } - } - MNN_CONCURRENCY_END(); - return NO_ERROR; - } - case UnaryOpOperation_EXP: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_COS: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_SIN: - return _unaryOp, float>(input->host(), 
output->host(), input->elementSize(), backend()); - case UnaryOpOperation_TAN: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_ATAN: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_SQRT: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_CEIL: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_RECIPROCAL: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_LOG1P: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_LOG: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_FLOOR: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_BNLL: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_ACOSH: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_SINH: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_ASINH: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_ATANH: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_SIGN: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_ROUND: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_COSH: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_ERF: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_ERFC: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_ERFINV: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_EXPM1: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_ASIN: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - case UnaryOpOperation_ACOS: - return _unaryOp, float>(input->host(), output->host(), input->elementSize(), backend()); - default: - MNN_ASSERT(false); - break; } + MNN_CONCURRENCY_END(); + return NO_ERROR; } @@ -487,13 +502,6 @@ class CPUUnaryCreator : public CPUBackend::Creator { public: virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, const MNN::Op *op, Backend *backend) const override { - auto opType = op->main_as_UnaryOp()->opType(); - if (UnaryOpOperation_SIGMOID == opType) { - return new CPUSigmoid(backend); - } - if (UnaryOpOperation_TANH == opType) { - return new CPUTanh(backend); - } return new CPUUnary(backend, op->main_as_UnaryOp()->opType()); } }; diff --git a/source/backend/cpu/arm/CMakeLists.txt b/source/backend/cpu/arm/CMakeLists.txt index 0cc43d86..46267131 100644 --- a/source/backend/cpu/arm/CMakeLists.txt +++ b/source/backend/cpu/arm/CMakeLists.txt @@ -1,10 +1,16 @@ IF(NOT DEFINED 
ARCHS) set(ARCHS ${CMAKE_SYSTEM_PROCESSOR}) ENDIF() -FILE(GLOB MNN_AArch32_SRC ${CMAKE_CURRENT_LIST_DIR}/arm32/*.s ${CMAKE_CURRENT_LIST_DIR}/arm32/*.S) -FILE(GLOB MNN_AArch64_SRC ${CMAKE_CURRENT_LIST_DIR}/arm64/*.s ${CMAKE_CURRENT_LIST_DIR}/arm64/*.S) -FILE(GLOB MNN_NEON_SRC ${CMAKE_CURRENT_LIST_DIR}/*.cpp) +FILE(GLOB MNN_AArch32_SRC ${CMAKE_CURRENT_LIST_DIR}/arm32/*.[sS]) +FILE(GLOB MNN_AArch64_SRC ${CMAKE_CURRENT_LIST_DIR}/arm64/*.[sS]) +FILE(GLOB MNN_NEON_SRC ${CMAKE_CURRENT_LIST_DIR}/CommonOptFunctionNeon.cpp) +if (MNN_SUPPORT_BF16) + FILE(GLOB MNN_NEON_SRC ${MNN_NEON_SRC} ${CMAKE_CURRENT_LIST_DIR}/CommonNeonBF16.cpp) +else() + LIST(FILTER MNN_AArch32_SRC EXCLUDE REGEX ".*BF16.*") + LIST(FILTER MNN_AArch64_SRC EXCLUDE REGEX ".*BF16.*") +endif() # remove the armv82 extension assemblies file if(NOT MNN_ARM82) @@ -28,8 +34,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64") if(MNN_ARM82) message(STATUS "Enable INT8 SDOT") - # add_definitions(-DENABLE_ARMV82) - target_compile_options(MNNARM64 PRIVATE -march=armv8.2-a+dotprod) + target_compile_options(MNNARM64 PRIVATE -march=armv8.2-a+dotprod -DENABLE_ARMV82) endif() else() diff --git a/source/backend/cpu/arm/CommonNeonBF16.cpp b/source/backend/cpu/arm/CommonNeonBF16.cpp new file mode 100644 index 00000000..cc646511 --- /dev/null +++ b/source/backend/cpu/arm/CommonNeonBF16.cpp @@ -0,0 +1,94 @@ +#include "core/Macro.h" + +#include "../compute/CommonOptFunction.h" +#include "./FunctionSummary.hpp" +// todo: search for proper value for bf16 +void NEON_MNNGetMatMulPackMode_BF16(int* eP, int* lP, int* hP) { + *eP = 12; + *lP = 1; +#ifdef __aarch64__ + *hP = 8; +#else + *hP = 4; +#endif +} + + +#ifdef __aarch64__ +void NEON_MNNPackForMatMul_B_BF16(float* destFloat, const float* sourceFloat, size_t h, size_t l, bool transpose) { + auto hP = (int)h / 8; + auto hR = (int)hP * 8; + int16_t* dest = (int16_t*)destFloat; + int16_t* source = (int16_t*)sourceFloat; + if (hR != h) { + ::memset(dest, 0, UP_DIV(h, 8) * 8 * l * sizeof(int16_t)); + } + if (!transpose) { + for (int y = 0; y < hP; ++y) { + auto destY = dest + y * 8 * l; + auto sourceY = source + y * 8; + for (int x = 0; x < l; ++x) { + ::memcpy(destY + 8 * x, sourceY + x * h, 8 * sizeof(int16_t)); + } + } + auto hRemain = h - hR; + if (hRemain > 0) { + auto destY = dest + hP * 8 * l; + auto sourceY = source + hP * 8; + for (int x = 0; x < l; ++x) { + ::memcpy(destY + 8 * x, sourceY + x * h, hRemain * sizeof(int16_t)); + } + } + return; + } + int lC8 = (int)l / 8; + auto lR = lC8 * 8; + if (hP > 0 && lC8 > 0) { + MNNPackC8_BF16(destFloat, sourceFloat, l, h); + } + for (int y = hR; y < h; ++y) { + auto yR = y % 8; + auto yC = hP; + for (int x = 0; x < l; ++x) { + dest[x * 8 + yR + yC * 8 * l] = source[x + y * l]; + } + } + for (int y = 0; y < hR; ++y) { + auto yR = y % 8; + auto yC = y / 8; + for (int x = lR; x < l; ++x) { + dest[x * 8 + yR + yC * 8 * l] = source[x + y * l]; + } + } +} + +#else +void NEON_MNNPackForMatMul_B_BF16(float* destFloat, const float* sourceFloat, size_t h, size_t l, bool transpose) { + int16_t* dest = (int16_t*)destFloat; + int16_t* source = (int16_t*)sourceFloat; + if (!transpose) { + auto hP = h / 4; + auto hR = hP * 4; + if (hR != h) { + ::memset(dest, 0, UP_DIV(h, 4) * 4 * l * sizeof(int16_t)); + } + for (int y = 0; y < hP; ++y) { + auto destY = dest + y * 4 * l; + auto sourceY = source + y * 4; + for (int x = 0; x < l; ++x) { + ::memcpy(destY + 4 * x, sourceY + x * h, 4 * sizeof(int16_t)); + } + } + auto hRemain = h - hR; + if (hRemain > 
0) { + auto destY = dest + hP * 4 * l; + auto sourceY = source + hP * 4; + for (int x = 0; x < l; ++x) { + ::memcpy(destY + 4 * x, sourceY + x * h, hRemain * sizeof(int16_t)); + } + } + return; + } + MNNPackC4_BF16(destFloat, sourceFloat, l, h); +} +#endif diff --git a/source/backend/cpu/arm/CommonOptFunctionNeon.cpp b/source/backend/cpu/arm/CommonOptFunctionNeon.cpp index 72647f1d..04982dcb 100644 --- a/source/backend/cpu/arm/CommonOptFunctionNeon.cpp +++ b/source/backend/cpu/arm/CommonOptFunctionNeon.cpp @@ -2,6 +2,7 @@ #include "../compute/CommonOptFunction.h" #ifdef MNN_USE_NEON #include +#include "./FunctionSummary.hpp" extern "C" { void MNNTranspose32Bit4x4(int32_t* dstO, const int32_t* srcO, int32_t* dim); } @@ -36,6 +37,7 @@ void MNNTranspose32Bit(int32_t* dstO, const int32_t* srcO, int32_t* dim) { } } } + void MNNGetMatMulPackMode(int* eP, int *lP, int* hP) { *eP = 12; *lP = 1; @@ -47,10 +49,9 @@ void MNNGetMatMulPackMode(int* eP, int *lP, int* hP) { } #ifdef __aarch64__ -extern "C" { -void MNNPackC8(float* dest, const float* source, size_t l, size_t h); -} +// input shape is (l, h) when transpose=false, else input shape is (h, l) +// output shape is (UP_DIV(h, 8), l, 8) void MNNPackForMatMul_B(float* dest, const float* source, size_t h, size_t l, bool transpose) { auto hP = (int)h / 8; auto hR = (int)hP * 8; @@ -124,4 +125,5 @@ void MNNPackForMatMul_B(float* dest, const float* source, size_t h, size_t l, bo } #endif + #endif diff --git a/source/backend/cpu/arm/FunctionSummary.hpp b/source/backend/cpu/arm/FunctionSummary.hpp new file mode 100644 index 00000000..55eeea5a --- /dev/null +++ b/source/backend/cpu/arm/FunctionSummary.hpp @@ -0,0 +1,57 @@ +// +// FunctionSummary.hpp +// MNN +// +// Created by MNN on 2021/02/23. +// Copyright © 2018 - 2021 Alibaba Group Holding Limited + +#ifndef FUNCTIONSUMMARY_HPP_ +#define FUNCTIONSUMMARY_HPP_ + +#include +#include +#include +#include "core/Macro.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __aarch64__ +void MNNPackC8(float* dest, const float* source, size_t l, size_t h); +#endif + +#if defined(MNN_SUPPORT_BF16) +void NEON_MNNGetMatMulPackMode_BF16(int* eP, int* lP, int* hP); + +void NEON_MNNPackC4ForMatMul_A_BF16(float* destOrigin, float const** sourceGroup, const int32_t* info, + const int32_t* el); + + +void NEON_MNNPackForMatMul_B_BF16(float* dest, const float* source, size_t h, size_t l, bool transpose); + +void NEON_MNNPackedMatMul_BF16(float* C, const float* A, const float* B, const size_t* parameter, + const float* postParameters, const float* bias); +void NEON_MNNPackedMatMulRemain_BF16(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, + const float* postParameters, const float* bias); + +void NEON_MNNConvRunForUnitDepthWise_BF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, + size_t weight_y_step, size_t dilateX_step, size_t dilateY_step); +void NEON_MNNConvRunForLineDepthwise_BF16(float* dst, const float* src, const float* weight, size_t width, + size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, + size_t height, size_t srcHStep, size_t dstHStep); +void NEON_MNNAxByClampBroadcastC4_BF16(float* C, const float* A, const float* B, size_t width, size_t cStride, + size_t aStride, size_t height, const float* parameters); + +void MNNPackC4_BF16(float* dest, const float* source, size_t area, size_t depth); +#ifdef __aarch64__ +void MNNPackC8_BF16(float* dest, const float* source, size_t l, size_t h); +#endif + +#endif + +#ifdef 
__cplusplus +} +#endif + +#endif diff --git a/source/backend/cpu/arm/arm32/MNNAddBias.S b/source/backend/cpu/arm/arm32/MNNAddBias.S deleted file mode 100644 index d9dfa534..00000000 --- a/source/backend/cpu/arm/arm32/MNNAddBias.S +++ /dev/null @@ -1,71 +0,0 @@ -// -// MNNAddBias.S -// MNN -// -// Created by MNN on 2019/02/04. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __arm__ -#ifndef __aarch64__ -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNAddBias -//void MNNAddBias(float* dst, const float* bias, int planeNumber, int biasNumber) -//r0:dst, r1:bias, r2:planeNumber, r3:biasNumber -push {r4, r5, lr} - -cmp r3, #0 -beq End - -cmp r2, #0 -beq End - -LoopBias: -vld1.32 {q15}, [r1]! - -mov r4, r2 - -L4: -cmp r4, #3 -ble L1 -Loop4: -mov r5, r0 -vld1.32 {q0, q1}, [r5]! -vadd.f32 q0, q0, q15 -vld1.32 {q2, q3}, [r5] -vadd.f32 q1, q1, q15 -vadd.f32 q2, q2, q15 -vst1.32 {q0, q1}, [r0]! -vadd.f32 q3, q3, q15 -vst1.32 {q2, q3}, [r0]! -sub r4, r4, #4 -cmp r4, #4 -bge Loop4 - -L1: -cmp r4, #0 -beq EndLoopPlane -Loop1: -vld1.32 {q0}, [r0] -vadd.f32 q0, q0, q15 -subs r4, r4, #1 -vst1.32 {q0}, [r0]! -bne Loop1 - -EndLoopPlane: - -subs r3, r3, #1 -bne LoopBias - - -End: - - -pop {r4, r5, pc} - -#endif -#endif diff --git a/source/backend/cpu/arm/arm32/MNNAddBiasRelu.S b/source/backend/cpu/arm/arm32/MNNAddBiasRelu.S deleted file mode 100644 index 616ad929..00000000 --- a/source/backend/cpu/arm/arm32/MNNAddBiasRelu.S +++ /dev/null @@ -1,77 +0,0 @@ -// -// MNNAddBiasRelu.S -// MNN -// -// Created by MNN on 2019/02/04. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __arm__ -#ifndef __aarch64__ -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNAddBiasRelu -//void MNNAddBiasRelu(float* dst, const float* bias, int planeNumber, int biasNumber) -//r0:dst, r1:bias, r2:planeNumber, r3:biasNumber -push {r4, r5, lr} - -cmp r3, #0 -beq BiasReluEnd - -cmp r2, #0 -beq BiasReluEnd - -vmov.i32 q14, #0 -ReluLoopBias: -vld1.32 {q15}, [r1]! - -mov r4, r2 - -ReluBiasReluL4: -cmp r4, #3 -ble BiasReluL1 -ReluLoop4: -mov r5, r0 -vld1.32 {q0, q1}, [r5]! -vadd.f32 q0, q0, q15 -vadd.f32 q1, q1, q15 -vld1.32 {q2, q3}, [r5] -vmax.f32 q0, q0, q14 -vmax.f32 q1, q1, q14 -vadd.f32 q2, q2, q15 -vst1.32 {q0, q1}, [r0]! -vmax.f32 q2, q2, q14 -vadd.f32 q3, q3, q15 -vmax.f32 q3, q3, q14 -vst1.32 {q2, q3}, [r0]! -sub r4, r4, #4 -cmp r4, #4 -bge ReluLoop4 - -BiasReluL1: -cmp r4, #0 -beq EndReluLoopPlane -ReluLoop1: -vld1.32 {q0}, [r0] -vadd.f32 q0, q0, q15 -vmax.f32 q0, q0, q14 -subs r4, r4, #1 -vst1.32 {q0}, [r0]! -bne ReluLoop1 - -EndReluLoopPlane: - -subs r3, r3, #1 -bne ReluLoopBias - - -BiasReluEnd: - - -pop {r4, r5, pc} - -#endif -#endif diff --git a/source/backend/cpu/arm/arm32/MNNAddBiasRelu6.S b/source/backend/cpu/arm/arm32/MNNAddBiasRelu6.S deleted file mode 100644 index 7290ba9e..00000000 --- a/source/backend/cpu/arm/arm32/MNNAddBiasRelu6.S +++ /dev/null @@ -1,85 +0,0 @@ -// -// MNNAddBiasRelu6.S -// MNN -// -// Created by MNN on 2019/01/22. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "MNNAsmGlobal.h" -#ifdef __arm__ -#ifndef __aarch64__ - -.text -.align 5 -asm_function MNNAddBiasRelu6 -//void MNNAddBiasRelu6(float* dst, const float* bias, int planeNumber, int biasNumber) -//r0:dst, r1:bias, r2:planeNumber, r3:biasNumber -push {r4, r5, lr} - -cmp r3, #0 -beq BiasReluEnd - -cmp r2, #0 -beq BiasReluEnd - -vmov.i32 q14, #0 -vmov.i32 q13, #6 -vcvt.f32.s32 q13, q13 -ReluLoopBias: - vld1.32 {q15}, [r1]! 
- - mov r4, r2 - - ReluBiasReluL4: - cmp r4, #3 - ble BiasReluL1 - ReluLoop4: - mov r5, r0 - vld1.32 {q0, q1}, [r5]! - vadd.f32 q0, q0, q15 - vadd.f32 q1, q1, q15 - vld1.32 {q2, q3}, [r5] - vmax.f32 q0, q0, q14 - vmax.f32 q1, q1, q14 - vmin.f32 q0, q0, q13 - vmin.f32 q1, q1, q13 - vadd.f32 q2, q2, q15 - vst1.32 {q0, q1}, [r0]! - vmax.f32 q2, q2, q14 - vadd.f32 q3, q3, q15 - vmin.f32 q2, q2, q13 - vmax.f32 q3, q3, q14 - vmin.f32 q3, q3, q13 - vst1.32 {q2, q3}, [r0]! - sub r4, r4, #4 - cmp r4, #4 - bge ReluLoop4 - - BiasReluL1: - cmp r4, #0 - beq EndReluLoopPlane - ReluLoop1: - vld1.32 {q0}, [r0] - vadd.f32 q0, q0, q15 - vmax.f32 q0, q0, q14 - vmin.f32 q0, q0, q13 - subs r4, r4, #1 - vst1.32 {q0}, [r0]! - bne ReluLoop1 - - EndReluLoopPlane: - - subs r3, r3, #1 - bne ReluLoopBias - - -BiasReluEnd: - - -pop {r4, r5, pc} - - - -#endif -#endif diff --git a/source/backend/cpu/arm/arm32/MNNAxByClampBroadcastC4.S b/source/backend/cpu/arm/arm32/MNNAxByClampBroadcastC4.S index fbb269eb..7d9ae56f 100644 --- a/source/backend/cpu/arm/arm32/MNNAxByClampBroadcastC4.S +++ b/source/backend/cpu/arm/arm32/MNNAxByClampBroadcastC4.S @@ -1,5 +1,5 @@ // -// MNNMatrixSub.S +// MNNAxByClampBroadcastUnit.S // MNN // // Created by MNN on 2020/06/20. @@ -14,8 +14,8 @@ .text .align 5 -asm_function MNNAxByClampBroadcastC4 -//void MNNAxByClampBroadcastC4(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters) +asm_function MNNAxByClampBroadcastUnit +//void MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters) //Auto: r0: C, r1:A, r2:B, r3:width //r4:cStride, r5:aStride, r6:height, r7:parameters push {r4-r11, lr} diff --git a/source/backend/cpu/arm/arm32/MNNAxByClampBroadcastC4_BF16.S b/source/backend/cpu/arm/arm32/MNNAxByClampBroadcastC4_BF16.S new file mode 100644 index 00000000..d2b5a3fe --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNAxByClampBroadcastC4_BF16.S @@ -0,0 +1,67 @@ +// +// NEON_MNNAxByClampBroadcastC4_BF16.S +// MNN +// +// Created by MNN on 2021/03/09. +// Copyright © 2018-2021 Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function NEON_MNNAxByClampBroadcastC4_BF16 +//void NEON_MNNAxByClampBroadcastC4_BF16(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters) +//Auto: r0: C, r1:A, r2:B, r3:width +//r4:cStride, r5:aStride, r6:height, r7:parameters +push {r4-r11, lr} +ldr r4, [sp, #36] +ldr r5, [sp, #40] +ldr r6, [sp, #44] +ldr r7, [sp, #48] + + +vld1.32 {q3}, [r7] +vdup.f32 q14, d7[0] +vdup.f32 q15, d7[1] +mov r12, #2 //sizeof(int16_t) +mul r4, r12, r4 +mul r5, r12, r5 + +LoopY: +mov r8, r0 +mov r9, r1 +vld1.16 {d26}, [r2]! +vshll.s16 q13, d26, #16 +mov r11, r3 + +L1: +cmp r11, #0 +beq EndLine + +L1Loop: +vld1.16 {d0}, [r1]! +vshll.s16 q0, d0, #16 +vmla.f32 q0, q13, d6[1] +vmax.f32 q0, q0, q14 +vmin.f32 q0, q0, q15 +vshrn.i32 d0, q0, #16 +vst1.16 {d0}, [r0]! 
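The BF16 assembly added in this and the following files leans on one conversion trick throughout: each 16-bit lane is treated as the high half of an IEEE-754 float, so vshll.s16 #16 widens bf16 to fp32 before the actual vmla.f32 math, and vshrn.i32 #16 truncates the result back to bf16 before the store. A scalar C++ sketch of what those two instructions amount to (illustrative only, not code from the patch):

    #include <cstdint>
    #include <cstring>
    // bf16 -> fp32: place the stored bits into the high half of a 32-bit word (exact)
    static inline float bf16_to_fp32(uint16_t v) {
        uint32_t bits = uint32_t(v) << 16;
        float f;
        std::memcpy(&f, &bits, sizeof(f));
        return f;
    }
    // fp32 -> bf16: keep only the high 16 bits of the encoding (truncation)
    static inline uint16_t fp32_to_bf16(float f) {
        uint32_t bits;
        std::memcpy(&bits, &f, sizeof(bits));
        return uint16_t(bits >> 16);
    }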
+subs r11, r11, #1 +bne L1Loop + +EndLine: +add r0, r8, r4 +add r1, r9, r5 + +subs r6, r6, #1 +bne LoopY + +pop {r4-r11, pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNConvRunForLineDepthwise_BF16.S b/source/backend/cpu/arm/arm32/MNNConvRunForLineDepthwise_BF16.S new file mode 100644 index 00000000..6a6c38b9 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNConvRunForLineDepthwise_BF16.S @@ -0,0 +1,238 @@ +// +// NEON_MNNConvRunForLineDepthwise_BF16.S +// MNN +// +// Created by MNN on 2021/03/09. +// Copyright © 2018-2021 Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function NEON_MNNConvRunForLineDepthwise_BF16 +//void NEON_MNNConvRunForLineDepthwise_BF16(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, +// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep) + + +//Auto Load: +//r0:dst, r1:src, r2:weight, r3:width + +push {r4-r11, lr} + +//Load From Sp +//r4:src_w_setup, r5:fw, r6:fh, r7:dilate_x_step, r8:dilate_y_step, r9: height, r10:srcHStep, r11:dstHStep +ldr r4, [sp, #36] +ldr r5, [sp, #40] +ldr r6, [sp, #44] +ldr r7, [sp, #48] +ldr r8, [sp, #52] +ldr r9, [sp, #56] +ldr r10, [sp, #60] +ldr r11, [sp, #64] + +vpush {q4-q7} + +mov r12, #2 +mul r4, r12, r4 +mul r7, r12, r7 // r7(dilate_x_step in byte) = sizeof(int16_t) * dilate_x_step +mul r8, r12, r8 // r8(dilate_y_step in byte) = sizeof(int16_t) * dilate_y_step +mul r10, r12, r10 +mul r11, r12, r11 + +//dilate_y_step -> dilate_y_step - fw*dilate_x_step +mul r12, r5, r7 +sub r8, r8, r12 + +LoopDY: +push {r0, r1, r3, r9, r10, r11} + +L8: +cmp r3, #7 +ble L4 + +mov r12, #8 +mul r12, r4, r12 + +L8Loop: + vmov.i32 q8, #0 + vmov.i32 q9, #0 + vmov.i32 q10, #0 + vmov.i32 q11, #0 + vmov.i32 q12, #0 + vmov.i32 q13, #0 + vmov.i32 q14, #0 + vmov.i32 q15, #0 + + vmov.i32 d14[0], r1 + vmov.i32 d14[1], r2 + mov r9, r6 + L8LoopH: + mov r10, r5 + L8LoopW: + vld1.16 {d6}, [r2]! + vld1.16 {q0}, [r1], r4 + vshll.s16 q3, d6, #16 + vshll.s16 q0, d0, #16 + subs r10, r10, #1 + vmla.f32 q8, q3, q0 + vld1.16 {d2}, [r1], r4 + vshll.s16 q1, d2, #16 + + vmla.f32 q9, q3, q1 + vld1.16 {d0}, [r1], r4 + vshll.s16 q0, d0, #16 + vmla.f32 q10, q0, q3 + vld1.16 {d2}, [r1], r4 + vshll.s16 q1, d2, #16 + vmla.f32 q11, q1, q3 + vld1.16 {d0}, [r1], r4 + vshll.s16 q0, d0, #16 + vmla.f32 q12, q0, q3 + vld1.16 {d2}, [r1], r4 + vshll.s16 q1, d2, #16 + vmla.f32 q13, q1, q3 + vld1.16 {q0}, [r1], r4 + vshll.s16 q0, d0, #16 + vmla.f32 q14, q0, q3 + vld1.16 {d2}, [r1], r4 + vshll.s16 q1, d2, #16 + vmla.f32 q15, q1, q3 + + sub r1, r1, r12 + add r1, r1, r7 + + bne L8LoopW + L8LoopWEnd: + subs r9, r9, #1 + add r1, r1, r8 + bne L8LoopH + + sub r3, r3, #8 + vshrn.i32 d16, q8, #16 + vshrn.i32 d17, q9, #16 + vst1.16 {d16, d17}, [r0]! + vmov.i32 r1, d14[0] + vmov.i32 r2, d14[1] + vshrn.i32 d20, q10, #16 + vshrn.i32 d21, q11, #16 + vst1.16 {d20, d21}, [r0]! + add r1, r1, r12 + vshrn.i32 d24, q12, #16 + vshrn.i32 d25, q13, #16 + vst1.16 {d24, d25}, [r0]! + cmp r3, #8 + vshrn.i32 d28, q14, #16 + vshrn.i32 d29, q15, #16 + vst1.16 {d28, d29}, [r0]! + bge L8Loop + +L4: +cmp r3, #3 +ble L1 + +mov r12, #4 +mul r12, r4, r12 + +L4Loop: + vmov.i32 q8, #0 + vmov.i32 q9, #0 + vmov.i32 q10, #0 + vmov.i32 q11, #0 + + vmov.i32 d8[0], r1 + vmov.i32 d9[0], r2 + mov r9, r6 + L4LoopH: + mov r10, r5 + L4LoopW: + vld1.16 {d24}, [r2]! 
+ vld1.16 {d0}, [r1], r4 + vshll.s16 q12, d24, #16 + vshll.s16 q0, d0, #16 + subs r10, r10, #1 + vmla.f32 q8, q12, q0 + vld1.16 {d2}, [r1], r4 + vshll.s16 q1, d2, #16 + vmla.f32 q9, q12, q1 + vld1.16 {d4}, [r1], r4 + vshll.s16 q2, d4, #16 + vmla.f32 q10, q2, q12 + vld1.16 {d6}, [r1], r4 + vshll.s16 q3, d6, #16 + sub r1, r1, r12 + vmla.f32 q11, q3, q12 + + add r1, r1, r7 + + bne L4LoopW + subs r9, r9, #1 + add r1, r1, r8 + bne L4LoopH + + sub r3, r3, #4 + vshrn.i32 d16, q8, #16 + vshrn.i32 d17, q9, #16 + vst1.16 {d16, d17}, [r0]! + vmov.i32 r1, d8[0] + vmov.i32 r2, d9[0] + vshrn.i32 d20, q10, #16 + vshrn.i32 d21, q11, #16 + vst1.16 {d20, d21}, [r0]! + add r1, r1, r12 + cmp r3, #4 + bge L4Loop + + + + +L1: +cmp r3, #0 +beq End + +L1Loop: + vmov.i32 q0, #0 + mov r9, r6 + mov r11, r1 + mov r12, r2 + L1LoopH: + mov r10, r5 + L1LoopW: + vld1.16 {d2}, [r1], r7 + vld1.16 {d4}, [r2]! + vshll.s16 q1, d2, #16 + vshll.s16 q2, d4, #16 + vmla.f32 q0, q1, q2 + subs r10, r10, #1 + bne L1LoopW + subs r9, r9, #1 + add r1, r1, r8 + bne L1LoopH + + subs r3, r3, #1 + vshrn.i32 d0, q0, #16 + vst1.16 {d0}, [r0]! + mov r2, r12 + add r1, r11, r4 + bne L1Loop + + +End: + +pop {r0, r1, r3, r9, r10, r11} +add r0, r0, r11 +subs r9, r9, #1 +add r1, r1, r10 +bne LoopDY + + +vpop {q4-q7} +pop {r4-r11, pc} + + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNConvRunForUnitDepthWise_BF16.S b/source/backend/cpu/arm/arm32/MNNConvRunForUnitDepthWise_BF16.S new file mode 100644 index 00000000..7dea49d0 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNConvRunForUnitDepthWise_BF16.S @@ -0,0 +1,77 @@ +// +// NEON_MNNConvRunForUnitDepthWise_BF16.S +// MNN +// +// Created by MNN on 2021/03/09. +// Copyright © 2018-2021 Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function NEON_MNNConvRunForUnitDepthWise_BF16 +//void NEON_MNNConvRunForUnitDepthWise_BF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step) + +//Auto: r0:dst, r1:src, r2:weight, r3:fw + +push {r4-r9, lr} + +//Load from sp: +//r5:fh, r6:weight_y_step, r7:dilate_x_step, r8:dilate_y_step +mov r4, r3 +ldr r5, [sp, #28] +ldr r6, [sp, #32] +ldr r7, [sp, #36] +ldr r8, [sp, #40] + +cmp r4, #0 +vmov.i32 q0, #0 +beq UnitEnd +cmp r5, #0 +beq UnitEnd + +mov r9, #2 +mul r6, r9, r6 // x6(weight_y_step in byte) = sizeof(int16_t) * weight_y_step +mul r7, r9, r7 // x7(dilate_x_step in byte) = sizeof(int16_t) * dilate_x_step +mul r8, r9, r8 // x8(dilate_y_step in byte) = sizeof(int16_t) * dilate_y_step + +//dilate_y_step -> dilate_y_step - dilate_x_step*fw +mul r9, r4, r7 +sub r8, r8, r9 + +//weight_y_step -> weight_y_step - 4*sizeof(float)*fw +mov r9, #8 +mul r9, r4, r9 +sub r6, r6, r9 + + +UnitLoopH: +mov r9, r4 +UnitLoopW: +vld1.16 {d2}, [r1], r7 +vld1.16 {d4}, [r2]! 
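+// the vshll below widens both bf16 operands (stored as int16) to f32 by shifting each lane left 16 bits before the multiply-accumulate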
+vshll.s16 q1, d2, #16 +vshll.s16 q2, d4, #16 + +vmla.f32 q0, q1, q2 +subs r9, r9, #1 +bne UnitLoopW +subs r5, r5, #1 +add r1, r1, r8 +add r2, r2, r6 +bne UnitLoopH + + +UnitEnd: +vshrn.i32 d0, q0, #16 +vst1.16 {d0}, [r0] + +pop {r4-r9, pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNFloat2Int8.S b/source/backend/cpu/arm/arm32/MNNFloat2Int8.S index 51d5fe60..46b68a2a 100644 --- a/source/backend/cpu/arm/arm32/MNNFloat2Int8.S +++ b/source/backend/cpu/arm/arm32/MNNFloat2Int8.S @@ -22,8 +22,8 @@ vcvt.s32.f32 \x, q13 .endm asm_function MNNFloat2Int8 -//void MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, float* scale, ssize_t aMin, ssize_t aMax); -//r0:src, r1:dst, r2:sizeQuad, r3:scale +//void MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, float* scale, ssize_t aMin, ssize_t aMax, ssize_t zeroPoint); +//r0:src, r1:dst, r2:sizeQuad, r3:scale, r4:aMin, r5:aMax, r6:zeroPoint push {lr} @@ -32,9 +32,15 @@ vmov.f32 q11, #-0.5 ldr r12, [sp, #4] vld1.32 {q15}, [r3] +// min vdup.s8 d28, r12 +// max ldr r12, [sp, #8] vdup.s8 d29, r12 +// zeropoint +ldr r12, [sp, #12] +vdup.s32 q9, r12 +vcvt.f32.s32 q9, q9 cmp r2, #3 ble FLLoop1 @@ -42,7 +48,9 @@ ble FLLoop1 FLLoop4: vld1.32 {q0, q1}, [r0]! vmul.f32 q0, q0, q15 +vadd.f32 q0, q0, q9 vmul.f32 q1, q1, q15 +vadd.f32 q1, q1, q9 vld1.32 {q2, q3}, [r0]! // vcvtr.s32.f32 s0, s0 // vcvtr.s32.f32 s1, s1 @@ -55,7 +63,9 @@ vld1.32 {q2, q3}, [r0]! _vroundq_f32 q10, q11, q0 _vroundq_f32 q10, q11, q1 vmul.f32 q2, q2, q15 +vadd.f32 q2, q2, q9 vmul.f32 q3, q3, q15 +vadd.f32 q3, q3, q9 // vcvtr.s32.f32 s8, s8 // vcvtr.s32.f32 s9, s9 // vcvtr.s32.f32 s10, s10 @@ -93,6 +103,7 @@ beq FLEnd FLLoop1: vld1.32 {q0}, [r0]! vmul.f32 q0, q0, q15 +vadd.f32 q0, q0, q9 // vcvtr.s32.f32 s0, s0 // vcvtr.s32.f32 s1, s1 // vcvtr.s32.f32 s2, s2 diff --git a/source/backend/cpu/arm/arm32/MNNGemmFloatCommon_4.S b/source/backend/cpu/arm/arm32/MNNGemmFloatCommon_4.S deleted file mode 100644 index a3bb7c5e..00000000 --- a/source/backend/cpu/arm/arm32/MNNGemmFloatCommon_4.S +++ /dev/null @@ -1,293 +0,0 @@ -// -// MNNGemmFloatCommon_4.S -// MNN -// -// Created by MNN on 2018/03/08. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __arm__ -#ifndef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNGemmFloatCommon_4 -//void MNNGemmFloatCommon_4(float* dst, const float* src, const float* weight, size_t src_depth_quad, -// size_t dst_step, size_t dst_depth_quad, size_t width, size_t weight_depth_offset) - -push {r4-r11, lr} - -//Auto Load: -//r0:dst, r1:src, r2:weight, r3: src_depth_quad - - -//Load from sp -//r4:dst_step, r5:dst_depth_quad, r6:width -ldr r4, [sp, #36] -ldr r5, [sp, #40] -ldr r6, [sp, #44] -ldr r9, [sp, #48] - -vpush {q4-q7} - -//step multi by sizeof(float) -mov r12, #4 -mul r4, r12, r4 -mul r9, r12, r9 - -//r7: src_z_step -mov r12, #16//4*sizeof(float) -mul r7, r12, r6 - -//r11: weight_dz_step -mov r12, #64 //16*sizeof(float) -mul r11, r12, r3 -add r11, r9, r11 - - -mov r9, r6 -LoopDz: -mov r8, r0 -mov r10, r1 -mov r12, r2 - -.macro START_TWO z0 z1 -vld1.32 {q0}, [r1]! -vmul.f32 \z0, q2, d0[0] -vld1.32 {q1}, [r1]! -vmla.f32 \z0, q3, d0[1] -vmul.f32 \z1, q2, d2[0] -vmla.f32 \z0, q4, d1[0] -vmla.f32 \z1, q3, d2[1] -vmla.f32 \z0, q5, d1[1] -vmla.f32 \z1, q4, d3[0] -vmla.f32 \z1, q5, d3[1] -.endm - -.macro COMPUTE_TWO z0 z1 -vld1.32 {q0}, [r1]! -vmla.f32 \z0, q2, d0[0] -vld1.32 {q1}, [r1]! 
-vmla.f32 \z0, q3, d0[1] -vmla.f32 \z1, q2, d2[0] -vmla.f32 \z0, q4, d1[0] -vmla.f32 \z1, q3, d2[1] -vmla.f32 \z0, q5, d1[1] -vmla.f32 \z1, q4, d3[0] -vmla.f32 \z1, q5, d3[1] -.endm - -.macro START_FOUR z0 z1 z2 z3 -vld1.32 {q0}, [r1]! -vmul.f32 \z0, q2, d0[0] -vld1.32 {q1}, [r1]! -vmla.f32 \z0, q3, d0[1] -vmul.f32 \z1, q2, d2[0] -vmla.f32 \z0, q4, d1[0] -vmla.f32 \z1, q3, d2[1] -vmla.f32 \z0, q5, d1[1] -vmla.f32 \z1, q4, d3[0] -vld1.32 {q0}, [r1]! -vmla.f32 \z1, q5, d3[1] -vmul.f32 \z2, q2, d0[0] -vld1.32 {q1}, [r1]! -vmla.f32 \z2, q3, d0[1] -vmul.f32 \z3, q2, d2[0] -vmla.f32 \z2, q4, d1[0] -vmla.f32 \z3, q3, d2[1] -vmla.f32 \z2, q5, d1[1] -vmla.f32 \z3, q4, d3[0] -vmla.f32 \z3, q5, d3[1] -.endm - -.macro COMPUTE_FOUR z0 z1 z2 z3 -vld1.32 {q0}, [r1]! -vmla.f32 \z0, q2, d0[0] -vld1.32 {q1}, [r1]! -vmla.f32 \z0, q3, d0[1] -vmla.f32 \z1, q2, d2[0] -vmla.f32 \z0, q4, d1[0] -vmla.f32 \z1, q3, d2[1] -vmla.f32 \z0, q5, d1[1] -vmla.f32 \z1, q4, d3[0] -vld1.32 {q0}, [r1]! -vmla.f32 \z1, q5, d3[1] -vmla.f32 \z2, q2, d0[0] -vld1.32 {q1}, [r1]! -vmla.f32 \z2, q3, d0[1] -vmla.f32 \z3, q2, d2[0] -vmla.f32 \z2, q4, d1[0] -vmla.f32 \z3, q3, d2[1] -vmla.f32 \z2, q5, d1[1] -vmla.f32 \z3, q4, d3[0] -vmla.f32 \z3, q5, d3[1] -.endm - -L4: -cmp r6, #3 -ble L2 - - -L4Loop: - vmov.i32 d30[0], r1 - vmov.i32 d30[1], r2 - vmov.i32 d31[1], r3 - vld1.32 {q4, q5}, [r2]! - vld1.32 {q6, q7}, [r2]! - - vld1.32 {q0, q1}, [r1]! - vld1.32 {q2, q3}, [r1]! - - vmul.f32 q8, q4, d0[0] - vmul.f32 q9, q4, d2[0] - vmul.f32 q10, q4, d4[0] - vmul.f32 q11, q4, d6[0] - - vmla.f32 q8, q5, d0[1] - vmla.f32 q9, q5, d2[1] - vmla.f32 q10, q5, d4[1] - vmla.f32 q11, q5, d6[1] - - vmla.f32 q8, q6, d1[0] - vmla.f32 q9, q6, d3[0] - vmla.f32 q10, q6, d5[0] - vmla.f32 q11, q6, d7[0] - - vmla.f32 q8, q7, d1[1] - vmla.f32 q9, q7, d3[1] - vmla.f32 q10, q7, d5[1] - vmla.f32 q11, q7, d7[1] - - subs r3, r3, #1 - beq L4LoopZEnd - L4LoopZ: - sub r1, r1, #64 - vld1.32 {q4, q5}, [r2]! - add r1, r1, r7 - vld1.32 {q6, q7}, [r2]! - - vld1.32 {q0, q1}, [r1]! - vld1.32 {q2, q3}, [r1]! - - vmla.f32 q8, q4, d0[0] - vmla.f32 q9, q4, d2[0] - vmla.f32 q10, q4, d4[0] - vmla.f32 q11, q4, d6[0] - - vmla.f32 q8, q5, d0[1] - vmla.f32 q9, q5, d2[1] - vmla.f32 q10, q5, d4[1] - vmla.f32 q11, q5, d6[1] - - vmla.f32 q8, q6, d1[0] - vmla.f32 q9, q6, d3[0] - vmla.f32 q10, q6, d5[0] - vmla.f32 q11, q6, d7[0] - - vmla.f32 q8, q7, d1[1] - vmla.f32 q9, q7, d3[1] - vmla.f32 q10, q7, d5[1] - vmla.f32 q11, q7, d7[1] - - subs r3, r3, #1 - bne L4LoopZ - L4LoopZEnd: - vmov.i32 r1, d30[0] - add r1, r1, #64 - vmov.i32 r2, d30[1] - vst1.32 {q8, q9}, [r8]! - sub r6, r6, #4 - vmov.i32 r3, d31[1] - cmp r6, #4 - vst1.32 {q10, q11}, [r8]! - bge L4Loop - -L2: -cmp r6, #2 -blt L1 - - -L2Loop: - vmov.i32 d30[0], r1 - vmov.i32 d30[1], r2 - vmov.i32 d31[1], r3 - vld1.32 {q2, q3}, [r2]! - vld1.32 {q4, q5}, [r2]! - - START_TWO q8, q9 - subs r3, r3, #1 - beq L2LoopZEnd - L2LoopZ: - sub r1, r1, #32 - vld1.32 {q2, q3}, [r2]! - add r1, r1, r7 - vld1.32 {q4, q5}, [r2]! - COMPUTE_TWO q8, q9 - subs r3, r3, #1 - bne L2LoopZ - L2LoopZEnd: - vmov.i32 r1, d30[0] - add r1, r1, #32 - vmov.i32 r2, d30[1] - vst1.32 {q8, q9}, [r8]! - sub r6, r6, #2 - vmov.i32 r3, d31[1] - cmp r6, #2 - bge L2Loop - - -L1: -cmp r6, #0 -beq End - -L1Loop: - vmov.i32 d16[0], r1 - vmov.i32 d16[1], r2 - vmov.i32 d17[0], r3 - vld1.32 {q3}, [r1], r7 - vld1.32 {q4, q5}, [r2]! - vmul.f32 q0, q4, d6[0] - vld1.32 {q6, q7}, [r2]! - vmul.f32 q1, q5, d6[1] - subs r3, r3, #1 - beq L1LoopZEnd - L1LoopZ: - vld1.32 {q4, q5}, [r2]! 
- vmla.f32 q0, q6, d7[0] - vmla.f32 q1, q7, d7[1] - vld1.32 {q3}, [r1], r7 - vmla.f32 q0, q4, d6[0] - vld1.32 {q6, q7}, [r2]! - vmla.f32 q1, q5, d6[1] - subs r3, r3, #1 - bne L1LoopZ - L1LoopZEnd: - vmla.f32 q0, q6, d7[0] - vmla.f32 q1, q7, d7[1] - - vadd.f32 q0, q0, q1 - vmov.i32 r1, d16[0] - vmov.i32 r2, d16[1] - vmov.i32 r3, d17[0] - add r1, r1, #16 - vst1.32 {q0}, [r8]! - subs r6, r6, #1 - bne L1Loop - -End: - -subs r5, r5, #1 -add r0, r0, r4 -mov r6, r9 -mov r1, r10 -add r2, r12, r11 -bne LoopDz - -vpop {q4-q7} -pop {r4-r11, pc} - -#endif -#endif diff --git a/source/backend/cpu/arm/arm32/MNNGemmFloatOne_4.S b/source/backend/cpu/arm/arm32/MNNGemmFloatOne_4.S deleted file mode 100644 index 1cbc6629..00000000 --- a/source/backend/cpu/arm/arm32/MNNGemmFloatOne_4.S +++ /dev/null @@ -1,91 +0,0 @@ -// -// MNNGemmFloatOne_4.S -// MNN -// -// Created by MNN on 2019/02/14. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __arm__ -#ifndef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNGemmFloatOne_4 -//void MNNGemmFloatOne_4(float* dst, const float* src, const float* weight, size_t src_depth_quad, -// size_t dst_step, size_t dst_depth_quad, size_t weight_depth_offset) - -push {r4-r11, lr} - -//Auto Load: -//r0:dst, r1:src, r2:weight, r3: src_depth_quad - - -//Load from sp -//r4:dst_step, r5:dst_depth_quad, r9:weight_depth_offset -ldr r4, [sp, #36] -ldr r5, [sp, #40] -ldr r9, [sp, #44] - - -//step multi by sizeof(float) -mov r12, #4 -mul r4, r12, r4 -mul r9, r12, r9 - -//r11: weight_dz_step -mov r12, #64 //16*sizeof(float) -mul r11, r12, r3 -add r11, r9, r11 - -mov r6, r3 -mov r10, r1 - -LoopDz: -mov r8, r0 -mov r12, r2 - -L1: -cmp r3, #0 -beq LZEnd - -vld1.32 {q0}, [r1]! -vld1.32 {q8, q9}, [r2]! -vmul.f32 q2, q8, d0[0] -vld1.32 {q10, q11}, [r2]! -subs r3, r3, #1 -vmul.f32 q3, q9, d0[1] -beq L1LoopZEnd -L1LoopZ: - vld1.32 {q8, q9}, [r2]! - vmla.f32 q2, q10, d1[0] - vmla.f32 q3, q11, d1[1] - vld1.32 {q0}, [r1]! - vmla.f32 q2, q8, d0[0] - vld1.32 {q10, q11}, [r2]! - vmla.f32 q3, q9, d0[1] - subs r3, r3, #1 - bne L1LoopZ -L1LoopZEnd: -vmla.f32 q2, q10, d1[0] -vmla.f32 q3, q11, d1[1] - -vadd.f32 q0, q2, q3 -vst1.32 {q0}, [r8]! - -LZEnd: - -subs r5, r5, #1 -add r0, r0, r4 -mov r1, r10 -add r2, r12, r11 -mov r3, r6 -bne LoopDz - -pop {r4-r11, pc} - -#endif -#endif diff --git a/source/backend/cpu/arm/arm32/MNNGemmFloatUnit_4.S b/source/backend/cpu/arm/arm32/MNNGemmFloatUnit_4.S deleted file mode 100644 index d87d66ee..00000000 --- a/source/backend/cpu/arm/arm32/MNNGemmFloatUnit_4.S +++ /dev/null @@ -1,214 +0,0 @@ -// -// MNNGemmFloatUnit_4.S -// MNN -// -// Created by MNN on 2019/02/04. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "MNNAsmGlobal.h" -#ifdef __arm__ -#ifndef __aarch64__ - -.text -.align 5 - -asm_function MNNGemmFloatUnit_4 -//void MNNGemmFloatUnit_4(float* dstOrigin, const float* src, const float* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, size_t weight_depth_offset) - -//Auto: -//r0:dstOrigin, r1:src, r2: weight, r3:src_depth_quad - -//Load from sp - -//r4: dst_step, r5:dst_depth_quad -//r8: weightExtraOffset - -push {r4-r8, lr} -ldr r4, [sp, #24] -ldr r5, [sp, #28] -ldr r8, [sp, #32] -//step multi by sizeof(float) -mov r12, #4 -mul r4, r12, r4 -mul r8, r12, r8 - -vpush {q4-q7} - -L8Dz: - mov r6, r1 - mov r12, r0 - subs r7, r3, #1 - vld1.32 {q4, q5}, [r2]! - vld1.32 {q6, q7}, [r2]! - vld1.32 {q0, q1}, [r1]! - vld1.32 {q2, q3}, [r1]! 
- - vmul.f32 q8, q4, d0[0] - vmul.f32 q9, q4, d2[0] - vmul.f32 q10, q4, d4[0] - vmul.f32 q11, q4, d6[0] - - vmla.f32 q8, q5, d0[1] - vmla.f32 q9, q5, d2[1] - vmla.f32 q10, q5, d4[1] - vmla.f32 q11, q5, d6[1] - - vmla.f32 q8, q6, d1[0] - vmla.f32 q9, q6, d3[0] - vmla.f32 q10, q6, d5[0] - vmla.f32 q11, q6, d7[0] - - vmla.f32 q8, q7, d1[1] - vmla.f32 q9, q7, d3[1] - vmla.f32 q10, q7, d5[1] - vmla.f32 q11, q7, d7[1] - - vld1.32 {q0, q1}, [r1]! - vld1.32 {q2, q3}, [r1]! - - vmul.f32 q12, q4, d0[0] - vmul.f32 q13, q4, d2[0] - vmul.f32 q14, q4, d4[0] - vmul.f32 q15, q4, d6[0] - - vmla.f32 q12, q5, d0[1] - vmla.f32 q13, q5, d2[1] - vmla.f32 q14, q5, d4[1] - vmla.f32 q15, q5, d6[1] - - vmla.f32 q12, q6, d1[0] - vmla.f32 q13, q6, d3[0] - vmla.f32 q14, q6, d5[0] - vmla.f32 q15, q6, d7[0] - - vmla.f32 q12, q7, d1[1] - vmla.f32 q13, q7, d3[1] - vmla.f32 q14, q7, d5[1] - vmla.f32 q15, q7, d7[1] - beq L8LoopZEnd - - subs r7, r7, #1 - - vld1.32 {q4, q5}, [r2]! - vld1.32 {q0, q1}, [r1]! - vld1.32 {q2, q3}, [r1]! - - vmla.f32 q8, q4, d0[0] - vmla.f32 q9, q4, d2[0] - beq L8LoopZEndRemain - - L8LoopZ: - vmla.f32 q10, q4, d4[0] - vmla.f32 q11, q4, d6[0] - - vmla.f32 q8, q5, d0[1] - vmla.f32 q9, q5, d2[1] - vld1.32 {q6, q7}, [r2]! - vmla.f32 q10, q5, d4[1] - vmla.f32 q11, q5, d6[1] - - vmla.f32 q8, q6, d1[0] - vmla.f32 q9, q6, d3[0] - vmla.f32 q10, q6, d5[0] - vmla.f32 q11, q6, d7[0] - - vmla.f32 q8, q7, d1[1] - vmla.f32 q9, q7, d3[1] - vmla.f32 q10, q7, d5[1] - vld1.32 {q0, q1}, [r1]! - vmla.f32 q11, q7, d7[1] - - vld1.32 {q2, q3}, [r1]! - - vmla.f32 q12, q4, d0[0] - vmla.f32 q13, q4, d2[0] - vmla.f32 q14, q4, d4[0] - vmla.f32 q15, q4, d6[0] - - vmla.f32 q12, q5, d0[1] - vmla.f32 q13, q5, d2[1] - vmla.f32 q14, q5, d4[1] - vmla.f32 q15, q5, d6[1] - - vmla.f32 q12, q6, d1[0] - vmla.f32 q13, q6, d3[0] - vmla.f32 q14, q6, d5[0] - vld1.32 {q4, q5}, [r2]! - vmla.f32 q15, q6, d7[0] - - vmla.f32 q12, q7, d1[1] - vmla.f32 q13, q7, d3[1] - vmla.f32 q14, q7, d5[1] - vld1.32 {q0, q1}, [r1]! - vmla.f32 q15, q7, d7[1] - - vld1.32 {q2, q3}, [r1]! - - vmla.f32 q8, q4, d0[0] - vmla.f32 q9, q4, d2[0] - - subs r7, r7, #1 - bne L8LoopZ - L8LoopZEndRemain: - vmla.f32 q10, q4, d4[0] - vmla.f32 q11, q4, d6[0] - - vmla.f32 q8, q5, d0[1] - vmla.f32 q9, q5, d2[1] - vld1.32 {q6, q7}, [r2]! - vmla.f32 q10, q5, d4[1] - vmla.f32 q11, q5, d6[1] - - vmla.f32 q8, q6, d1[0] - vmla.f32 q9, q6, d3[0] - vmla.f32 q10, q6, d5[0] - vmla.f32 q11, q6, d7[0] - - vmla.f32 q8, q7, d1[1] - vmla.f32 q9, q7, d3[1] - vmla.f32 q10, q7, d5[1] - vld1.32 {q0, q1}, [r1]! - vmla.f32 q11, q7, d7[1] - - vld1.32 {q2, q3}, [r1]! - - vmla.f32 q12, q4, d0[0] - vmla.f32 q13, q4, d2[0] - vmla.f32 q14, q4, d4[0] - vmla.f32 q15, q4, d6[0] - - vmla.f32 q12, q5, d0[1] - vmla.f32 q13, q5, d2[1] - vmla.f32 q14, q5, d4[1] - vmla.f32 q15, q5, d6[1] - - vmla.f32 q12, q6, d1[0] - vmla.f32 q13, q6, d3[0] - vmla.f32 q14, q6, d5[0] - vmla.f32 q15, q6, d7[0] - - vmla.f32 q12, q7, d1[1] - vmla.f32 q13, q7, d3[1] - vmla.f32 q14, q7, d5[1] - vmla.f32 q15, q7, d7[1] - L8LoopZEnd: - vst1.32 {q8, q9}, [r0]! - vst1.32 {q10, q11}, [r0]! - vst1.32 {q12, q13}, [r0]! - vst1.32 {q14, q15}, [r0]! 
- mov r1, r6 - - subs r5, r5, #1 - add r2, r2, r8 - add r0, r12, r4 - bne L8Dz - - -vpop {q4-q7} - - -pop {r4-r8, pc} - -#endif -#endif diff --git a/source/backend/cpu/arm/arm32/MNNInt8ScaleToFloat.S b/source/backend/cpu/arm/arm32/MNNInt8ScaleToFloat.S index ea71bdb7..c17f748c 100644 --- a/source/backend/cpu/arm/arm32/MNNInt8ScaleToFloat.S +++ b/source/backend/cpu/arm/arm32/MNNInt8ScaleToFloat.S @@ -16,13 +16,17 @@ asm_function MNNInt8ScaleToFloat -// void MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, size_t size) +// void MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, size_t size, ssize_t zeroPoint) push {lr} +ldr r12, [sp, #4] +vdup.s32 q13, r12 +vcvt.f32.s32 q13, q13 + vpush {q4-q7} // Auto Load: -// r0: dst*, r1: src*, r2: scale*, r3: size +// r0: dst*, r1: src*, r2: scale*, r3: size, r4: zeroPoint vld1.32 {q15}, [r2] @@ -40,13 +44,17 @@ L4Loop: vmovl.s16 q3, d11 vmovl.s16 q1, d9 vcvt.f32.s32 q0, q0 + vsub.f32 q0, q13 vmul.f32 q0, q15 vcvt.f32.s32 q1, q1 + vsub.f32 q1, q13 vmul.f32 q1, q15 vst1.32 {q0, q1}, [r0]! vcvt.f32.s32 q2, q2 + vsub.f32 q2, q13 vmul.f32 q2, q15 vcvt.f32.s32 q3, q3 + vsub.f32 q3, q13 vmul.f32 q3, q15 vst1.32 {q2, q3}, [r0]! @@ -63,6 +71,7 @@ L1Loop: vmovl.s16 q0, d8 subs r3, r3, #1 vcvt.f32.s32 q1, q0 + vsub.f32 q1, q13 vmul.f32 q0, q1, q15 vst1.32 {q0}, [r0]! bne L1Loop diff --git a/source/backend/cpu/arm/arm32/MNNPackC4.S b/source/backend/cpu/arm/arm32/MNNPackC4.S index 062bfacc..17c81fd4 100644 --- a/source/backend/cpu/arm/arm32/MNNPackC4.S +++ b/source/backend/cpu/arm/arm32/MNNPackC4.S @@ -6,6 +6,7 @@ // Copyright © 2018, Alibaba Group Holding Limited // + #ifdef __arm__ #ifndef __aarch64__ diff --git a/source/backend/cpu/arm/arm32/MNNPackC4ForMatMul_A.S b/source/backend/cpu/arm/arm32/MNNPackC4ForMatMul_A.S index 6a072326..a8debe0a 100644 --- a/source/backend/cpu/arm/arm32/MNNPackC4ForMatMul_A.S +++ b/source/backend/cpu/arm/arm32/MNNPackC4ForMatMul_A.S @@ -13,43 +13,58 @@ .text .align 5 asm_function MNNPackC4ForMatMul_A -//void MNNPackC4ForMatMul_A(float* dest, const float* source, size_t e, size_t l, size_t eReal) -//Auto: r0: dest, r1:source, r2: e, r3:l, r4: eReal -// eReal -> eReal * 4 * sizeof(float) - 192 +//void MNNPackC4ForMatMul_A(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el) +//Auto: r0: dest, r1:sourceGroup, r2: info, r3:el push {r4-r11, lr} -ldr r4, [sp, #36] +ldr r10, [r2, #0] // number +ldr r4, [r2, #4] // eReal +ldr r11, [r2, #8] // eDest +ldr r6, [r2, #12] // xOffset +// xOffset -> xOffset * 4 * sizeof(float) +// eReal -> eReal * 4 * sizeof(float) +// eDest -> eDest * sizeof(float) +mov r12, #4 // sizeof(float). kept as a const +mov r9, #16 +mul r4, r9, r4 +mul r11, r12, r11 +mul r6, r9, r6 -mov r9, #4 -mov r12, #16 -mul r4, r12, r4 -mul r8, r9, r2 +LoopNumber: +ldr r5, [r3, #4] // l +ldr r8, [r3, #8] // eOffset +ldr r7, [r3, #12] // lOffset -sub r4, r4, #192 +push {r0, r1} +ldr r1, [r1, #0] -// Set r9 as l * 12 * sizeof(float) -mov r12, #48 -mul r9, r3, r12 +// Compute dest ptr: r0 = r0 + eOffset * sizeof(float) + lOffset * eDest * sizeof(float) +mul r7, r11, r7 +mul r8, r12, r8 +add r0, r0, r7 +add r0, r0, r8 + +ldr r2, [r3, #0] // e Body: cmp r2, #12 blt Right - -LoopE12: - mov r6, r0 - mov r7, r1 - mov r5, r3 cmp r5, #4 blt LoopEL3 LoopL4: + mov r2, r1 .macro MAIN_TRANSPOSE - vld1.32 {q0, q1}, [r1]! - vld1.32 {q2, q3}, [r1]! - - vld1.32 {q8, q9}, [r1]! - vld1.32 {q10, q11}, [r1]! - - vld1.32 {q12, q13}, [r1]! - vld1.32 {q14, q15}, [r1]! 
+ vld1.32 {q0}, [r1], r6 + vld1.32 {q1}, [r1], r6 + vld1.32 {q2}, [r1], r6 + vld1.32 {q3}, [r1], r6 + vld1.32 {q8}, [r1], r6 + vld1.32 {q9}, [r1], r6 + vld1.32 {q10}, [r1], r6 + vld1.32 {q11}, [r1], r6 + vld1.32 {q12}, [r1], r6 + vld1.32 {q13}, [r1], r6 + vld1.32 {q14}, [r1], r6 + vld1.32 {q15}, [r1], r6 vtrn.32 d0, d2 vtrn.32 d1, d3 @@ -93,7 +108,7 @@ LoopE12: vst1.32 {q11}, [r0]! vst1.32 {q15}, [r0]! - add r1, r1, r4 + add r1, r2, r4 sub r5, r5, #4 cmp r5, #4 bge LoopL4 @@ -115,8 +130,7 @@ LoopE12: vst1.32 {q10}, [r0]! vst1.32 {q14}, [r0]! - - sub r5, r5, #3 + b LoopEEnd LoopEL2: cmp r5, #2 @@ -129,41 +143,34 @@ LoopE12: vst1.32 {q1}, [r0]! vst1.32 {q9}, [r0]! vst1.32 {q13}, [r0]! - sub r5, r5, #2 + b LoopEEnd LoopEL1: - cmp r5, #1 - blt LoopEEnd + cmp r5, #0 + beq LoopEEnd MAIN_TRANSPOSE vst1.32 {q0}, [r0]! vst1.32 {q8}, [r0]! vst1.32 {q12}, [r0]! LoopEEnd: - sub r2, r2, #12 - cmp r2, #12 - add r0, r6, r9 - add r1, r7, #192 // 12 * 4 * sizeof(float) - bge LoopE12 +b End -cmp r2, #0 -beq End Right: -add r4, r4, #192 LoopE1: - mov r6, r0 + mov r9, r5 mov r7, r1 - mov r5, r3 + mov r8, r0 cmp r5, #4 blt LoopE1L3 LoopE1L4: vld1.32 {q0}, [r1], r4 - vst1.32 {d0[0]}, [r0], r8 - vst1.32 {d0[1]}, [r0], r8 - vst1.32 {d1[0]}, [r0], r8 - vst1.32 {d1[1]}, [r0], r8 + vst1.32 {d0[0]}, [r0], r11 + vst1.32 {d0[1]}, [r0], r11 + vst1.32 {d1[0]}, [r0], r11 + vst1.32 {d1[1]}, [r0], r11 sub r5, r5, #4 cmp r5, #4 bge LoopE1L4 @@ -172,9 +179,9 @@ LoopE1: cmp r5, #3 blt LoopE1L2 vld1.32 {q0}, [r1], r4 - vst1.32 {d0[0]}, [r0], r8 - vst1.32 {d0[1]}, [r0], r8 - vst1.32 {d1[0]}, [r0], r8 + vst1.32 {d0[0]}, [r0], r11 + vst1.32 {d0[1]}, [r0], r11 + vst1.32 {d1[0]}, [r0], r11 sub r5, r5, #3 @@ -182,25 +189,33 @@ LoopE1: cmp r5, #2 blt LoopE1L1 vld1.32 {d0}, [r1], r4 - vst1.32 {d0[0]}, [r0], r8 - vst1.32 {d0[1]}, [r0], r8 + vst1.32 {d0[0]}, [r0], r11 + vst1.32 {d0[1]}, [r0], r11 sub r5, r5, #2 LoopE1L1: cmp r5, #1 blt LoopE1End vld1.32 {d0[0]}, [r1], r4 - vst1.32 {d0[0]}, [r0], r8 + vst1.32 {d0[0]}, [r0], r11 LoopE1End: subs r2, r2, #1 - add r0, r6, #4 - add r1, r7, #16 // 4 * sizeof(float) + add r0, r8, r12 + add r1, r7, r6 + mov r5, r9 bne LoopE1 End: +pop {r0, r1} +subs r10, r10, #1 +add r3, r3, #16 +add r1, r1, #4 + +bne LoopNumber + pop {r4-r11, pc} #endif diff --git a/source/backend/cpu/arm/arm32/MNNPackC4ForMatMul_A_BF16.S b/source/backend/cpu/arm/arm32/MNNPackC4ForMatMul_A_BF16.S new file mode 100644 index 00000000..4702e3b9 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNPackC4ForMatMul_A_BF16.S @@ -0,0 +1,208 @@ +// +// NEON_MNNPackC4ForMatMul_A_BF16.S +// MNN +// +// Created by MNN on 2021/02/21. 
+// Copyright © 2018-2021 Alibaba Group Holding Limited +// +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +asm_function NEON_MNNPackC4ForMatMul_A_BF16 +// treate float pointer as int16_t* +//void NEON_MNNPackC4ForMatMul_A_BF16(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el) +//Auto: r0: dest, r1:sourceGroup, r2: info, r3:el +push {r4-r11, lr} +ldr r10, [r2, #0] // number +ldr r4, [r2, #4] // eReal +ldr r11, [r2, #8] // eDest +ldr r6, [r2, #12] // xOffset +// xOffset -> xOffset * 4 * sizeof(float) +// eReal -> eReal * 4 * sizeof(float) +// eDest -> eDest * sizeof(float) +mov r12, #2 // sizeof(int16_t) +mov r9, #8 // sizeof(int16_t) * 4 +mul r4, r9, r4 +mul r11, r12, r11 +mul r6, r9, r6 + +LoopNumber: +ldr r5, [r3, #4] // l +ldr r8, [r3, #8] // eOffset +ldr r7, [r3, #12] // lOffset + +push {r0, r1} +ldr r1, [r1, #0] + +// Compute dest ptr: r0 = r0 + eOffset * sizeof(float) + lOffset * eDest * sizeof(float) +; mov r9, #2 //sizeof(int16_t) +mul r7, r11, r7 +mul r8, r12, r8 +add r0, r0, r7 +add r0, r0, r8 + +ldr r2, [r3, #0] // e + +Body: +cmp r2, #12 +blt Right + cmp r5, #4 + blt LoopEL3 + LoopL4: + mov r2, r1 +.macro MAIN_TRANSPOSE + vld1.16 {d16}, [r1], r6 // load size: 4 * sizeof(int16_t) + vld1.16 {d19}, [r1], r6 + vld1.16 {d22}, [r1], r6 + vld1.16 {d25}, [r1], r6 + vld1.16 {d17}, [r1], r6 + vld1.16 {d20}, [r1], r6 + vld1.16 {d23}, [r1], r6 + vld1.16 {d26}, [r1], r6 + vld1.16 {d18}, [r1], r6 + vld1.16 {d21}, [r1], r6 + vld1.16 {d24}, [r1], r6 + vld1.16 {d27}, [r1], r6 + + // transpose each 4 16-bit elements in 2 d_n vectors, by transpose 16-bit and scale up transpose 32-bit. + vtrn.16 d16, d19 + vtrn.16 d22, d25 + // vswp d0[2-3], d2[0-1] + // vswp d1[2-3], d3[0-1] + // swap half of 64-bit is equal to transpose in 32-bit unit. 
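+    // the vtrn.32 pairs below finish the 4x4 transpose of 16-bit lanes that the vtrn.16 pairs above started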
+ vtrn.32 d16, d22 + vtrn.32 d19, d25 + + vtrn.16 d17, d20 + vtrn.16 d23, d26 + vtrn.32 d17, d23 + vtrn.32 d20, d26 + + vtrn.16 d18, d21 + vtrn.16 d24, d27 + vtrn.32 d18, d24 + vtrn.32 d21, d27 + // after transpose from 12x4 to 4x12, memory layout is + // +-------+------+------+ + // | d16...|d17...|d18...| + // +-------+------+------+ + // | d19...|d20...|d21...| + // +-------+------+------+ + // | d22...|d23...|d24...| + // +-------+------+------+ + // | d25...|d26...|d27...| + // +-------+------+------+ +.endm + MAIN_TRANSPOSE + + vstm r0!, {d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27} // store at one time: 12 * 4 * sizeof(int16_t) + + add r1, r2, r4 + sub r5, r5, #4 + cmp r5, #4 + bge LoopL4 + + LoopEL3: + cmp r5, #3 + blt LoopEL2 + MAIN_TRANSPOSE + + vstm r0!, {d16, d17, d18, d19, d20, d21, d22, d23, d24} + + b LoopEEnd + + LoopEL2: + cmp r5, #2 + blt LoopEL1 + MAIN_TRANSPOSE + + vstm r0!, {d16, d17, d18, d19, d20, d21} + + b LoopEEnd + + LoopEL1: + cmp r5, #0 + beq LoopEEnd + MAIN_TRANSPOSE + + vstm r0!, {d16, d17, d18} + + LoopEEnd: + +b End + + +Right: + +LoopE1: + mov r9, r5 + mov r7, r1 + mov r8, r0 + cmp r5, #4 + blt LoopE1L3 + LoopE1L4: + vld1.16 {d0}, [r1], r4 + vst1.16 {d0[0]}, [r0], r11 + vst1.16 {d0[1]}, [r0], r11 + vst1.16 {d0[2]}, [r0], r11 + vst1.16 {d0[3]}, [r0], r11 + sub r5, r5, #4 + cmp r5, #4 + bge LoopE1L4 + + LoopE1L3: + cmp r5, #3 + blt LoopE1L2 + vld1.16 {d0}, [r1], r4 + vst1.16 {d0[0]}, [r0], r11 + vst1.16 {d0[1]}, [r0], r11 + vst1.16 {d0[2]}, [r0], r11 + + sub r5, r5, #3 + + LoopE1L2: + cmp r5, #2 + blt LoopE1L1 + vld1.16 {d0}, [r1], r4 + vst1.16 {d0[0]}, [r0], r11 + vst1.16 {d0[1]}, [r0], r11 + sub r5, r5, #2 + + LoopE1L1: + cmp r5, #1 + blt LoopE1End + vld1.16 {d0[0]}, [r1], r4 + vst1.16 {d0[0]}, [r0], r11 + + LoopE1End: + + subs r2, r2, #1 + add r0, r8, r12 // !!!! caution : sizeof(int16_t) + add r1, r7, r6 + mov r5, r9 + bne LoopE1 + +End: + +pop {r0, r1} +subs r10, r10, #1 + +// x3 is (const int32_t* el), this array size of 4. as a result for next struct element, +// address added by 4 * sizeof(int32_t) +add r3, r3, #16 + +// x1 is (const int16_t** sourceGroup), even though data content is int16_t, +// the element in sourceGroup in 'int16_t*', as a result for next struct element, +// value added by sizeof(void*) +add r1, r1, #4 + +bne LoopNumber + +pop {r4-r11, pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNPackC4_BF16.S b/source/backend/cpu/arm/arm32/MNNPackC4_BF16.S new file mode 100644 index 00000000..e2d60399 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNPackC4_BF16.S @@ -0,0 +1,187 @@ +// +// MNNPackC4_BF16.S +// MNN +// +// Created by MNN on 2021/02/26. +// Copyright © 2018-2021 Alibaba Group Holding Limited +// + + + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" +.text +.align 5 + +// .macro transpose +// vtrn.16 d0, d1 +// vtrn.16 d2, d3 +// vswp d0[2-3], d1[2-3] // should swap high half of d-vector, the half is 32-bit. 
there is no instruction, we use vst4.16 instead +// vswp d2[2-3], d3[2-3] +// .endm + +asm_function MNNPackC4_BF16 +// treate float pointer as int16_t* +//void MNNPackC4_BF16(float* dst, const float* src, size_t area, size_t depth) +//Auto load: +//r0:dst, r1:src, r2:area, r3:depth + + +push {r4, r5, r6, r7, r8, lr} + +mul r4, r2, r3 +cmp r4, #0 +beq UpEnd + +//r4: src DepthOffset:area*sizeof(int16_t) +mov r4, #2 // sizeof(int16_t) +mul r4, r2, r4 + +UpL4: +cmp r3, #3 +ble UpL3 + +UpL4Loop: +add r5, r1, r4 +add r6, r4, r5 +add r7, r4, r6 +mov r8, r2 +cmp r8, #3 +ble UpL4AreaRemain +UpL4AreaLoop: +vld1.16 {d0}, [r1]! // load 4 elements of 16-bit into 64bit vector register d0 +vld1.16 {d1}, [r5]! +vld1.16 {d2}, [r6]! +vld1.16 {d3}, [r7]! +// transpose // no suitable instruction to transpose int16_t type +vst4.16 {d0, d1, d2, d3}, [r0]! +sub r8, r8, #4 +cmp r8, #4 +bge UpL4AreaLoop + +UpL4AreaRemain: +cmp r8, #0 +beq UpL4AreaRemainEnd +UpL4AreaRemainLoop: +vld1.16 {d0[0]}, [r1]! +vld1.16 {d0[1]}, [r5]! +vld1.16 {d1[0]}, [r6]! +vld1.16 {d1[1]}, [r7]! + +vst1.16 {d0}, [r0]! + +subs r8, r8, #1 +bne UpL4AreaRemainLoop +UpL4AreaRemainEnd: +sub r3, r3, #4 +mov r1, r7 +cmp r3, #4 +bge UpL4Loop + +UpL3: +cmp r3, #2 +ble UpL2 +add r5, r1, r4 +add r6, r4, r5 +mov r8, r2 +cmp r8, #3 +ble UpL3AreaRemain +UpL3AreaLoop: +vld1.16 {d0}, [r1]! +vmov.i16 d3, #0 +vld1.16 {d1}, [r5]! +vld1.16 {d2}, [r6]! +// transpose // no suitable instruction to transpose int16_t type +vst4.16 {d0, d1, d2, d3}, [r0]! +sub r8, r8, #4 +cmp r8, #4 +bge UpL3AreaLoop + +cmp r8, #0 +beq UpL3AreaRemainEnd +UpL3AreaRemain: +vmov.i16 d0, #0 +vld1.16 {d0[0]}, [r1]! +vld1.16 {d0[1]}, [r5]! +vld1.16 {d1[0]}, [r6]! + +vst1.16 {d0}, [r0]! + +subs r8, r8, #1 +bne UpL3AreaRemain + +UpL3AreaRemainEnd: +sub r3, r3, #3 + + +UpL2: +cmp r3, #1 +ble UpL1 +add r5, r1, r4 +mov r8, r2 +cmp r8, #3 +ble UpL2AreaRemain +UpL2AreaLoop: +vld1.16 {d0}, [r1]! +vmov.i16 d3, #0 +vld1.16 {d1}, [r5]! +vmov.i16 d2, #0 +// transpose // no suitable instruction to transpose int16_t type +vst4.16 {d0, d1, d2, d3}, [r0]! +sub r8, r8, #4 +cmp r8, #4 +bge UpL2AreaLoop + +cmp r8, #0 +beq UpL2AreaRemainEnd +UpL2AreaRemain: +vmov.i16 d0, #0 +vld1.16 {d0[0]}, [r1]! +vld1.16 {d0[1]}, [r5]! + +vst1.16 {d0}, [r0]! + +subs r8, r8, #1 +bne UpL2AreaRemain + +UpL2AreaRemainEnd: +sub r3, r3, #2 + +UpL1: +cmp r3, #0 +beq UpEnd +mov r8, r2 +cmp r8, #3 +ble UpL1AreaRemain +UpL1AreaLoop: +vld1.16 {d0}, [r1]! +vmov.i16 d3, #0 +vmov.i16 d1, #0 +vmov.i16 d2, #0 +// transpose // no suitable instruction to transpose int16_t type +vst4.16 {d0, d1, d2, d3}, [r0]! +sub r8, r8, #4 +cmp r8, #4 +bge UpL1AreaLoop + +cmp r8, #0 +beq UpL1AreaRemainEnd +UpL1AreaRemain: +vmov.i16 d0, #0 +vld1.16 {d0[0]}, [r1]! + +vst1.16 {d0}, [r0]! 
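+// remainder path: d0 is zeroed each iteration, one bf16 value is loaded into lane 0, and the zero-padded group of 4 is stored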
+ +subs r8, r8, #1 +bne UpL1AreaRemain + +UpL1AreaRemainEnd: + +UpEnd: + +pop {r4, r5, r6, r7, r8, pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNPackedMatMul.S b/source/backend/cpu/arm/arm32/MNNPackedMatMul.S index df369101..d0af81b6 100644 --- a/source/backend/cpu/arm/arm32/MNNPackedMatMul.S +++ b/source/backend/cpu/arm/arm32/MNNPackedMatMul.S @@ -12,15 +12,15 @@ .text .align 5 -// 12 * 8 MatMul +// 12 * 4 MatMul asm_function MNNPackedMatMul -//void MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter, float* cache, const float* postParameters, const float* bias); +//void MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias); // Auto: r0: C, r1:A, r2:B, r3:parameter -// Load from sp: r4: cache, no use, r5: postParameters, r6:bias +// Load from sp: r5: postParameters, r6:bias push {r4-r11, lr} -ldr r5, [sp, #40] -ldr r6, [sp, #44] +ldr r5, [sp, #36] +ldr r6, [sp, #40] ldr r4, [r3, #8] // h ldr r7, [r3, #4] // l diff --git a/source/backend/cpu/arm/arm32/MNNPackedMatMulRemain.S b/source/backend/cpu/arm/arm32/MNNPackedMatMulRemain.S index 1484a672..f2a4b119 100644 --- a/source/backend/cpu/arm/arm32/MNNPackedMatMulRemain.S +++ b/source/backend/cpu/arm/arm32/MNNPackedMatMulRemain.S @@ -21,8 +21,8 @@ asm_function MNNPackedMatMulRemain push {r4-r11, lr} ldr r4, [sp, #36] -ldr r6, [sp, #44] -ldr r7, [sp, #48] +ldr r6, [sp, #40] +ldr r7, [sp, #44] ldr r12, [r4, #0] cmp r6, #0 beq Start diff --git a/source/backend/cpu/arm/arm32/MNNPackedMatMulRemain_BF16.S b/source/backend/cpu/arm/arm32/MNNPackedMatMulRemain_BF16.S new file mode 100644 index 00000000..7d6b9f2b --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNPackedMatMulRemain_BF16.S @@ -0,0 +1,154 @@ +// +// NEON_MNNPackedMatMulRemain_BF16.S +// MNN +// +// Created by MNN on 2021/02/24. +// Copyright © 2018-2021 Alibaba Group Holding Limited. +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +// 12 * 8 MatMul +asm_function NEON_MNNPackedMatMulRemain_BF16 +// treate float pointer as int16_t* +//void NEON_MNNPackedMatMulRemain_BF16(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias); +//Auto r0: C, r1:A, r2:B, r3:eSize, +//r4:parameter, r5: cache no usage, r6:postParameters, r7:bias + +push {r4-r11, lr} +ldr r4, [sp, #36] +ldr r6, [sp, #40] +ldr r7, [sp, #44] +ldr r12, [r4, #0] +cmp r6, #0 +beq Start +vld1.32 {q3}, [r6] +vdup.f32 q12, d7[0] // min +vdup.f32 q13, d7[1] // max +Start: +cmp r3, #4 +blt L1 + +LoopE4: + ldr r5, [r4, #8] // h + add r5, r5, #3 + lsr r5, r5, #2 // r5 = UP_DIV(r5, 4) + mov r9, r0 + mov r11, r2 + push {r7} + LoopE4H: + mov r10, r1 + ldr r8, [r4, #4] // l + vmov.i32 q8, #0 + vmov.i32 q9, #0 + vmov.i32 q10, #0 + vmov.i32 q11, #0 + LoopE4L: + vld1.16 {d0}, [r10], r12 + vld1.16 {d2}, [r11]! // load 4 * sizeof(int16_t) + vshll.s16 q0, d0, #16 // shift left long of each int16_t as float32 + vshll.s16 q1, d2, #16 + vmla.f32 q8, q1, d0[0] + vmla.f32 q9, q1, d0[1] + vmla.f32 q10, q1, d1[0] + vmla.f32 q11, q1, d1[1] + subs r8, r8, #1 + bne LoopE4L + cmp r6, #0 + beq StoreE4 + vld1.16 {d28}, [r7]! 
// load 4 * sizeof(int16_t) + vshll.s16 q14, d28, #16 // shift left long of each int16_t as float32 + vmla.f32 q8, q14, d6[1] + vmla.f32 q9, q14, d6[1] + vmla.f32 q10, q14, d6[1] + vmla.f32 q11, q14, d6[1] + + PostTreatE4: + vmax.f32 q8, q8, q12 + vmax.f32 q9, q9, q12 + vmax.f32 q10, q10, q12 + vmax.f32 q11, q11, q12 + + vmin.f32 q8, q8, q13 + vmin.f32 q9, q9, q13 + vmin.f32 q10, q10, q13 + vmin.f32 q11, q11, q13 + + StoreE4: + ldr r8, [r4, #20] + add r11, r11, r8 + ldr r8, [r4, #12] + + vshrn.i32 d16, q8, #16 // shift right 16bit of each float32 as int16_t + vshrn.i32 d17, q9, #16 + vshrn.i32 d18, q10, #16 + vshrn.i32 d19, q11, #16 + vst1.16 {d16, d17}, [r9]! + vst1.16 {d18, d19}, [r9], r8 + sub r9, r9, #16 + subs r5, r5, #1 // move 4 colum along lP dim. lP = l / 4 + bne LoopE4H + sub r3, r3, #4 // move 4 colum along e dim. + add r0, r0, #32 // move address of 4 * 4 * sizeof(int16_t) + add r1, r1, #8 // move address of 4 * sizeof(int16_t) in src tile block + cmp r3, #4 + pop {r7} + bge LoopE4 + +L1: +cmp r3, #0 +beq End +LoopE1: + ldr r5, [r4, #8] // h + add r5, r5, #3 + lsr r5, r5, #2 + mov r9, r0 + mov r11, r2 + push {r7} + LoopE1H: + mov r10, r1 + ldr r8, [r4, #4] // l + vmov.i32 q15, #0 + LoopE1L: + vld1.16 {d0[0]}, [r10], r12 + vld1.16 {d2}, [r11]! // load 4 * sizeof(int16_t) + vshll.s16 q0, d0, #16 // shift left long of each int16_t as float32 + vshll.s16 q1, d2, #16 + + vmla.f32 q15, q1, d0[0] + subs r8, r8, #1 + bne LoopE1L + cmp r6, #0 + beq StoreE1 + vld1.16 {d28}, [r7]! // load 4 * sizeof(int16_t) + vshll.s16 q14, d28, #16 // shift left long of each int16_t as float32 + vmla.f32 q15, q14, d6[1] + + PostTreatE1: + vmax.f32 q15, q15, q12 + vmin.f32 q15, q15, q13 + + StoreE1: + ldr r8, [r4, #20] + add r11, r11, r8 + ldr r8, [r4, #12] + + vshrn.i32 d30, q15, #16 // shift right 16bit of each float32 as int16_t + vst1.16 {d30}, [r9], r8 + subs r5, r5, #1 + bne LoopE1H + subs r3, r3, #1 + add r0, r0, #8 // move address of 4 * sizeof(int16_t) + add r1, r1, #2 // move address of 1 * sizeof(int16_t) + pop {r7} + bne LoopE1 +End: +pop {r4-r11, pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNPackedMatMul_BF16.S b/source/backend/cpu/arm/arm32/MNNPackedMatMul_BF16.S new file mode 100644 index 00000000..2bf8a1a3 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNPackedMatMul_BF16.S @@ -0,0 +1,211 @@ +// +// NEON_MNNPackedMatMul_BF16.S +// MNN +// +// Created by MNN on 2021/02/24. +// Copyright © 2018-2021 Alibaba Group Holding Limited. +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +// 12 * 8 MatMul +asm_function NEON_MNNPackedMatMul_BF16 +// treate float pointer as int16_t* +//void NEON_MNNPackedMatMul_BF16(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias); +// Auto: r0: C, r1:A, r2:B, r3:parameter +// Load from sp: r5: postParameters, r6:bias + +push {r4-r11, lr} +ldr r5, [sp, #36] +ldr r6, [sp, #40] + +ldr r4, [r3, #8] // h +ldr r7, [r3, #4] // l +add r4, r4, #3 +ldr r8, [r3, #12]//cStride +ldr r3, [r3, #20]//bExtraStride +lsr r4, r4, #2 + +sub r8, r8, #96 // after segment "Store", total line stride is CStride, all vst. offset is 12 * 4 * size_t(int16_t) = 96byte + +vpush {q4-q7} +// q0, q1, q2: src +// q3: weight +// q4 - q15: dst + +LoopH: + subs r12, r7, #1 + mov r11, r1 + vld1.16 {d6}, [r2]! + vld1.16 {d0, d1}, [r11]! // load 2 * 4 * sizeof(int16_t) + vshll.s16 q3, d6, #16 // shift left long of each int16_t as float32 + vshll.s16 q1, d1, #16 // !! 
caution: must shll d1 before d0 + vshll.s16 q0, d0, #16 + + vmul.f32 q4, q3, d0[0] + vmul.f32 q5, q3, d0[1] + vmul.f32 q6, q3, d1[0] + vld1.16 {d4}, [r11]! // load 4 * sizeof(int16_t) + vshll.s16 q2, d4, #16 + vmul.f32 q7, q3, d1[1] + + vmul.f32 q8, q3, d2[0] + vmul.f32 q9, q3, d2[1] + vmul.f32 q10, q3, d3[0] + vmul.f32 q11, q3, d3[1] + + vmul.f32 q12, q3, d4[0] + vmul.f32 q13, q3, d4[1] + vmul.f32 q14, q3, d5[0] + vmul.f32 q15, q3, d5[1] + beq LoopLEnd + LoopL: + vld1.16 {d6}, [r2]! + vld1.16 {d0, d1}, [r11]! // load 2 * 4 * sizeof(int16_t) + vshll.s16 q3, d6, #16 // shift left long of each int16_t as float32 + vshll.s16 q1, d1, #16 // !! caution: must shll d1 before d0 + vshll.s16 q0, d0, #16 + + vmla.f32 q4, q3, d0[0] + vmla.f32 q5, q3, d0[1] + vmla.f32 q6, q3, d1[0] + vld1.16 {d4}, [r11]! + vshll.s16 q2, d4, #16 + + vmla.f32 q7, q3, d1[1] + + vmla.f32 q8, q3, d2[0] + vmla.f32 q9, q3, d2[1] + vmla.f32 q10, q3, d3[0] + vmla.f32 q11, q3, d3[1] + + vmla.f32 q12, q3, d4[0] + vmla.f32 q13, q3, d4[1] + vmla.f32 q14, q3, d5[0] + vmla.f32 q15, q3, d5[1] + + subs r12, r12, #1 + bne LoopL + LoopLEnd: + cmp r5, #0 + beq Store + vld1.32 {q0}, [r5] // parameter remains float + cmp r6, #0 + beq LoadOrigin + vld1.16 {d6}, [r6]! // load 4 * sizeof(int16_t) + vshll.s16 q3, d6, #16 // shift left long of each int16_t as int32_t + vmla.f32 q4, q3, d0[1] + vmla.f32 q5, q3, d0[1] + vmla.f32 q6, q3, d0[1] + vmla.f32 q7, q3, d0[1] + vmla.f32 q8, q3, d0[1] + vmla.f32 q9, q3, d0[1] + vmla.f32 q10, q3, d0[1] + vmla.f32 q11, q3, d0[1] + vmla.f32 q12, q3, d0[1] + vmla.f32 q13, q3, d0[1] + vmla.f32 q14, q3, d0[1] + vmla.f32 q15, q3, d0[1] + + b PostTreat + + LoadOrigin: + mov r11, r0 + vld1.16 {d2, d3}, [r11]! // load 2 * 4 * sizeof(int16_t) + vshll.s16 q2, d3, #16 // shift left long of each int16_t as int32_t + vshll.s16 q1, d2, #16 + vmla.f32 q4, q1, d0[1] + vmla.f32 q5, q2, d0[1] + + vld1.16 {d2, d3}, [r11]! // load 2 * 4 * sizeof(int16_t) + vshll.s16 q2, d3, #16 // shift left long of each int16_t as int32_t + vshll.s16 q1, d2, #16 + vmla.f32 q6, q1, d0[1] + vmla.f32 q7, q2, d0[1] + + vld1.16 {d2, d3}, [r11]! // load 2 * 4 * sizeof(int16_t) + vshll.s16 q2, d3, #16 // shift left long of each int16_t as int32_t + vshll.s16 q1, d2, #16 + vmla.f32 q8, q1, d0[1] + vmla.f32 q9, q2, d0[1] + + vld1.16 {d2, d3}, [r11]! // load 2 * 4 * sizeof(int16_t) + vshll.s16 q2, d3, #16 // shift left long of each int16_t as int32_t + vshll.s16 q1, d2, #16 + vmla.f32 q10, q1, d0[1] + vmla.f32 q11, q2, d0[1] + + vld1.16 {d2, d3}, [r11]! // load 2 * 4 * sizeof(int16_t) + vshll.s16 q2, d3, #16 // shift left long of each int16_t as int32_t + vshll.s16 q1, d2, #16 + vmla.f32 q12, q1, d0[1] + vmla.f32 q13, q2, d0[1] + + vld1.16 {d2, d3}, [r11]! 
// load 2 * 4 * sizeof(int16_t) + vshll.s16 q2, d3, #16 // shift left long of each int16_t as int32_t + vshll.s16 q1, d2, #16 + vmla.f32 q14, q1, d0[1] + vmla.f32 q15, q2, d0[1] + + PostTreat: + vdup.f32 q2, d1[0] // min + vdup.f32 q1, d1[1] // max + + vmax.f32 q4, q4, q2 + vmax.f32 q5, q5, q2 + vmax.f32 q6, q6, q2 + vmax.f32 q7, q7, q2 + vmax.f32 q8, q8, q2 + vmax.f32 q9, q9, q2 + vmax.f32 q10, q10, q2 + vmax.f32 q11, q11, q2 + vmax.f32 q12, q12, q2 + vmax.f32 q13, q13, q2 + vmax.f32 q14, q14, q2 + vmax.f32 q15, q15, q2 + + vmin.f32 q4, q4, q1 + vmin.f32 q5, q5, q1 + vmin.f32 q6, q6, q1 + vmin.f32 q7, q7, q1 + vmin.f32 q8, q8, q1 + vmin.f32 q9, q9, q1 + vmin.f32 q10, q10, q1 + vmin.f32 q11, q11, q1 + vmin.f32 q12, q12, q1 + vmin.f32 q13, q13, q1 + vmin.f32 q14, q14, q1 + vmin.f32 q15, q15, q1 + + Store: + vshrn.i32 d8, q4, #16 // !!caution: these instructions has relying, eg: d10 must be written after reading q5. shift right 16bit of each float32 as int16_t + vshrn.i32 d9, q5, #16 + vshrn.i32 d10, q6, #16 + vshrn.i32 d11, q7, #16 + vshrn.i32 d12, q8, #16 + vshrn.i32 d13, q9, #16 + vshrn.i32 d14, q10, #16 + vshrn.i32 d15, q11, #16 + vshrn.i32 d16, q12, #16 + vshrn.i32 d17, q13, #16 + vshrn.i32 d18, q14, #16 + vshrn.i32 d19, q15, #16 + + vstm r0!, {d8, d9, d10, d11, d12, d13, d14, d15, d16, d17, d18, d19} + + add r0, r0, r8 + add r2, r2, r3 + + subs r4, r4, #1 + bne LoopH + +vpop {q4-q7} +pop {r4-r11, pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNUnPackC4_BF16.S b/source/backend/cpu/arm/arm32/MNNUnPackC4_BF16.S new file mode 100644 index 00000000..053906cd --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNUnPackC4_BF16.S @@ -0,0 +1,184 @@ +// +// NEON_MNNUnPackC4_BF16.S +// MNN +// +// Created by MNN on 2021/02/24. +// Copyright © 2018-2021 Alibaba Group Holding Limited. +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" +.text +.align 5 + +// .macro transpose +// vtrn.16 d0, d1 +// vtrn.16 d2, d3 +// vswp d0[2-3], d1[2-3] // should swap high half of d-vector, the half length is 32-bit. there is no instruction, we use vld4.16 instead +// vswp d2[2-3], d3[2-3] +// .endm + + +asm_function NEON_MNNUnpackC4_BF16 +// treate float pointer as int16_t* +//void NEON_MNNUnpackC4_BF16(float* dst, const float* src, size_t area, size_t depth); +//Auto load: +//r0:dst, r1:src, r2:area, r3:depth + + +push {r4, r5, r6, r7, r8, lr} +mul r4, r2, r3 +cmp r4, #0 +beq DownEnd + +//Swap r0 and r1 for conviniense +mov r4, r0 +mov r0, r1 +mov r1, r4 + +//r4: srcDepthOffset:area*sizeof(int16_t) +mov r4, #2 // sizeof(int16_t) +mul r4, r2, r4 + +DownL4: +cmp r3, #3 +ble DownL3 + +DownL4Loop: +add r5, r1, r4 +add r6, r4, r5 +add r7, r4, r6 +mov r8, r2 +cmp r8, #3 +ble DownL4AreaRemain +DownL4AreaLoop: +vld4.16 {d0, d1, d2, d3}, [r0]! // load and transpose 4x4 matrix of int16_t +// transpose // no suitable instruction to transpose int16_t type +sub r8, r8, #4 +vst1.16 {d0}, [r1]! +vst1.16 {d1}, [r5]! +vst1.16 {d2}, [r6]! +vst1.16 {d3}, [r7]! +cmp r8, #4 +bge DownL4AreaLoop + +DownL4AreaRemain: +cmp r8, #0 +beq DownL4AreaRemainEnd +DownL4AreaRemainLoop: + +vld1.16 {d0}, [r0]! + +vst1.16 {d0[0]}, [r1]! +vst1.16 {d0[1]}, [r5]! +vst1.16 {d1[0]}, [r6]! +vst1.16 {d1[1]}, [r7]! + +subs r8, r8, #1 +bne DownL4AreaRemainLoop +DownL4AreaRemainEnd: +sub r3, r3, #4 +mov r1, r7 +cmp r3, #4 +bge DownL4Loop + +DownL3: +cmp r3, #2 +ble DownL2 +add r5, r1, r4 +add r6, r4, r5 +mov r8, r2 +cmp r8, #3 +ble DownL3AreaRemain +DownL3AreaLoop: +vld4.16 {d0, d1, d2, d3}, [r0]! 
// load and transpose 4x4 matrix of int16_t +// transpose +sub r8, r8, #4 +vst1.16 {d0}, [r1]! +vst1.16 {d1}, [r5]! +vst1.16 {d2}, [r6]! +cmp r8, #4 +bge DownL3AreaLoop + +cmp r8, #0 +beq DownL3AreaRemainEnd +DownL3AreaRemain: +vld1.16 {d0}, [r0]! + +vst1.16 {d0[0]}, [r1]! +vst1.16 {d0[1]}, [r5]! +vst1.16 {d1[0]}, [r6]! + +subs r8, r8, #1 +bne DownL3AreaRemain + +DownL3AreaRemainEnd: +sub r3, r3, #3 + + +DownL2: +cmp r3, #1 +ble DownL1 +add r5, r1, r4 +mov r8, r2 +cmp r8, #3 +ble DownL2AreaRemain +DownL2AreaLoop: +vld4.16 {d0, d1, d2, d3}, [r0]! // load and transpose 4x4 matrix of int16_t +// transpose +vst1.16 {d0}, [r1]! +vst1.16 {d1}, [r5]! +sub r8, r8, #4 +cmp r8, #4 +bge DownL2AreaLoop + +cmp r8, #0 +beq DownL2AreaRemainEnd +DownL2AreaRemain: +vld1.16 {d0}, [r0]! +vst1.16 {d0[0]}, [r1]! +vst1.16 {d0[1]}, [r5]! + +subs r8, r8, #1 +bne DownL2AreaRemain + +DownL2AreaRemainEnd: +sub r3, r3, #2 + +DownL1: +cmp r3, #0 +beq DownEnd +mov r8, r2 +cmp r8, #3 +ble DownL1AreaRemain +DownL1AreaLoop: +vld4.16 {d0, d1, d2, d3}, [r0]! // load and transpose 4x4 matrix of int16_t +// transpose +vst1.16 {d0}, [r1]! +sub r8, r8, #4 +cmp r8, #4 +bge DownL1AreaLoop + +cmp r8, #0 +beq DownL1AreaRemainEnd +DownL1AreaRemain: +vld1.16 {d0}, [r0]! + +vst1.16 {d0[0]}, [r1]! +subs r8, r8, #1 +bne DownL1AreaRemain + +DownL1AreaRemainEnd: + +DownEnd: + + + +pop {r4, r5, r6, r7, r8, pc} + + + +#endif +#endif diff --git a/source/backend/cpu/arm/arm64/MNNAddBias.S b/source/backend/cpu/arm/arm64/MNNAddBias.S deleted file mode 100644 index ab55c060..00000000 --- a/source/backend/cpu/arm/arm64/MNNAddBias.S +++ /dev/null @@ -1,67 +0,0 @@ -// -// MNNAddBias.S -// MNN -// -// Created by MNN on 2019/02/04. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __aarch64__ -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNAddBias -//void MNNAddBias(float* dst, const float* bias, int planeNumber, int biasNumber) -//x0:dst, x1:bias, x2:planeNumber, x3:biasNumber - -cmp x3, #0 -beq End - -cmp x2, #0 -beq End - -LoopBias: -ld1 {v31.4s}, [x1], #16 - -mov x4, x2 - -L4: -cmp x4, #3 -ble L1 -Loop4: -mov x5, x0 -ld1 {v0.4s, v1.4s}, [x5], #32 -fadd v0.4s, v0.4s, v31.4s -ld1 {v2.4s, v3.4s}, [x5] -fadd v1.4s, v1.4s, v31.4s -fadd v2.4s, v2.4s, v31.4s -st1 {v0.4s, v1.4s}, [x0], #32 -fadd v3.4s, v3.4s, v31.4s -st1 {v2.4s, v3.4s}, [x0], #32 -sub x4, x4, #4 -cmp x4, #4 -bge Loop4 - -L1: -cmp x4, #0 -beq EndLoopPlane -Loop1: -ld1 {v0.4s}, [x0] -fadd v0.4s, v0.4s, v31.4s -subs x4, x4, #1 -st1 {v0.4s}, [x0], #16 -bne Loop1 - -EndLoopPlane: - -subs x3, x3, #1 -bne LoopBias - - -End: - -ret - -#endif diff --git a/source/backend/cpu/arm/arm64/MNNAddBiasRelu.S b/source/backend/cpu/arm/arm64/MNNAddBiasRelu.S deleted file mode 100644 index 6dd8a62d..00000000 --- a/source/backend/cpu/arm/arm64/MNNAddBiasRelu.S +++ /dev/null @@ -1,71 +0,0 @@ -// -// MNNAddBiasRelu.S -// MNN -// -// Created by MNN on 2019/02/04. 
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __aarch64__ -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNAddBiasRelu -//void MNNAddBiasRelu(float* dst, const float* bias, int planeNumber, int biasNumber) -//x0:dst, x1:bias, x2:planeNumber, x3:biasNumber -cmp x3, #0 -beq BiasReluEnd - -cmp x2, #0 -beq BiasReluEnd - -movi v22.4s, #0 -ReluLoopBias: -ld1 {v23.4s}, [x1], #16 - -mov x4, x2 - -ReluBiasReluL4: -cmp x4, #3 -ble BiasReluL1 -ReluLoop4: -mov x5, x0 -ld1 {v0.4s, v1.4s}, [x5], #32 -fadd v0.4s, v0.4s, v23.4s -fadd v1.4s, v1.4s, v23.4s -ld1 {v2.4s, v3.4s}, [x5] -fmax v0.4s, v0.4s, v22.4s -fmax v1.4s, v1.4s, v22.4s -fadd v2.4s, v2.4s, v23.4s -st1 {v0.4s, v1.4s}, [x0], #32 -fmax v2.4s, v2.4s, v22.4s -fadd v3.4s, v3.4s, v23.4s -fmax v3.4s, v3.4s, v22.4s -st1 {v2.4s, v3.4s}, [x0], #32 -sub x4, x4, #4 -cmp x4, #4 -bge ReluLoop4 - -BiasReluL1: -cmp x4, #0 -beq EndReluLoopPlane -ReluLoop1: -ld1 {v0.4s}, [x0] -fadd v0.4s, v0.4s, v23.4s -fmax v0.4s, v0.4s, v22.4s -subs x4, x4, #1 -st1 {v0.4s}, [x0], #16 -bne ReluLoop1 - -EndReluLoopPlane: - -subs x3, x3, #1 -bne ReluLoopBias - - -BiasReluEnd: -ret - -#endif diff --git a/source/backend/cpu/arm/arm64/MNNAddBiasRelu6.S b/source/backend/cpu/arm/arm64/MNNAddBiasRelu6.S deleted file mode 100644 index cf645462..00000000 --- a/source/backend/cpu/arm/arm64/MNNAddBiasRelu6.S +++ /dev/null @@ -1,79 +0,0 @@ -// -// MNNAddBiasRelu6.S -// MNN -// -// Created by MNN on 2019/01/22. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 -asm_function MNNAddBiasRelu6 -//void MNNAddBiasRelu6(float* dst, const float* bias, int planeNumber, int biasNumber) -//x0:dst, x1:bias, x2:planeNumber, x3:biasNumber -cmp x3, #0 -beq BiasReluEnd - -cmp x2, #0 -beq BiasReluEnd - -movi v22.4s, #0 -movi v21.4s, #6 -scvtf v21.4s, v21.4s - -ReluLoopBias: - ld1 {v23.4s}, [x1], #16 - - mov x4, x2 - - ReluBiasReluL4: - cmp x4, #3 - ble BiasReluL1 - ReluLoop4: - mov x5, x0 - ld1 {v0.4s, v1.4s}, [x5], #32 - fadd v0.4s, v0.4s, v23.4s - fadd v1.4s, v1.4s, v23.4s - ld1 {v2.4s, v3.4s}, [x5] - fmax v0.4s, v0.4s, v22.4s - fmax v1.4s, v1.4s, v22.4s - fmin v0.4s, v0.4s, v21.4s - fmin v1.4s, v1.4s, v21.4s - fadd v2.4s, v2.4s, v23.4s - st1 {v0.4s, v1.4s}, [x0], #32 - fmax v2.4s, v2.4s, v22.4s - fadd v3.4s, v3.4s, v23.4s - fmin v2.4s, v2.4s, v21.4s - fmax v3.4s, v3.4s, v22.4s - fmin v3.4s, v3.4s, v21.4s - st1 {v2.4s, v3.4s}, [x0], #32 - sub x4, x4, #4 - cmp x4, #4 - bge ReluLoop4 - - BiasReluL1: - cmp x4, #0 - beq EndReluLoopPlane - ReluLoop1: - ld1 {v0.4s}, [x0] - fadd v0.4s, v0.4s, v23.4s - fmax v0.4s, v0.4s, v22.4s - fmin v0.4s, v0.4s, v21.4s - subs x4, x4, #1 - st1 {v0.4s}, [x0], #16 - bne ReluLoop1 - - EndReluLoopPlane: - - subs x3, x3, #1 - bne ReluLoopBias - - -BiasReluEnd: -ret - -#endif diff --git a/source/backend/cpu/arm/arm64/MNNAxByClampBroadcastC4.S b/source/backend/cpu/arm/arm64/MNNAxByClampBroadcastC4.S index d6583698..025efcf9 100644 --- a/source/backend/cpu/arm/arm64/MNNAxByClampBroadcastC4.S +++ b/source/backend/cpu/arm/arm64/MNNAxByClampBroadcastC4.S @@ -1,5 +1,5 @@ // -// MNNMatrixSub.S +// MNNAxByClampBroadcastUnit.S // MNN // // Created by MNN on 2020/06/20. 
@@ -13,13 +13,14 @@ .text .align 5 -asm_function MNNAxByClampBroadcastC4 -//void MNNAxByClampBroadcastC4(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters) +asm_function MNNAxByClampBroadcastUnit +//void MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters) //Auto: x0: C, x1:A, x2:B, x3:width //x4:cStride, x5:aStride, x6:height, x7:parameters -ld1 {v7.4s}, [x7] -dup v30.4s, v7.s[2] -dup v31.4s, v7.s[3] +ld4r {v28.4s, v29.4s, v30.4s, v31.4s}, [x7] +// ld1 {v7.4s}, [x7] +// dup v30.4s, v7.s[2] +// dup v31.4s, v7.s[3] mov x12, #4 //sizeof(float) mul x4, x12, x4 mul x5, x12, x5 @@ -40,17 +41,17 @@ cmp x11, #8 ldp q16, q17, [x1], #32 ldp q18, q19, [x1], #32 -fmla v16.4s, v6.4s, v7.s[1] -fmla v17.4s, v6.4s, v7.s[1] +fmla v16.4s, v6.4s, v29.4s +fmla v17.4s, v6.4s, v29.4s ldp q20, q21, [x1], #32 -fmla v18.4s, v6.4s, v7.s[1] -fmla v19.4s, v6.4s, v7.s[1] +fmla v18.4s, v6.4s, v29.4s +fmla v19.4s, v6.4s, v29.4s ldp q22, q23, [x1], #32 -fmla v20.4s, v6.4s, v7.s[1] -fmla v21.4s, v6.4s, v7.s[1] -fmla v22.4s, v6.4s, v7.s[1] -fmla v23.4s, v6.4s, v7.s[1] +fmla v20.4s, v6.4s, v29.4s +fmla v21.4s, v6.4s, v29.4s +fmla v22.4s, v6.4s, v29.4s +fmla v23.4s, v6.4s, v29.4s blt L8ComputeEnd @@ -64,34 +65,34 @@ fmax v20.4s, v20.4s, v30.4s fmax v21.4s, v21.4s, v30.4s fmax v22.4s, v22.4s, v30.4s fmax v23.4s, v23.4s, v30.4s - +add x0, x0, #(32 * 4) +add x1, x1, #(32 * 4) fmin v16.4s, v16.4s, v31.4s fmin v17.4s, v17.4s, v31.4s fmin v18.4s, v18.4s, v31.4s fmin v19.4s, v19.4s, v31.4s -stp q16, q17, [x0], #32 fmin v20.4s, v20.4s, v31.4s fmin v21.4s, v21.4s, v31.4s -stp q18, q19, [x0], #32 fmin v22.4s, v22.4s, v31.4s -ldp q16, q17, [x1], #32 fmin v23.4s, v23.4s, v31.4s -ldp q18, q19, [x1], #32 -fmla v16.4s, v6.4s, v7.s[1] -fmla v17.4s, v6.4s, v7.s[1] -stp q20, q21, [x0], #32 -fmla v18.4s, v6.4s, v7.s[1] -stp q22, q23, [x0], #32 -fmla v19.4s, v6.4s, v7.s[1] -ldp q20, q21, [x1], #32 -ldp q22, q23, [x1], #32 - -fmla v20.4s, v6.4s, v7.s[1] -fmla v21.4s, v6.4s, v7.s[1] -fmla v22.4s, v6.4s, v7.s[1] -fmla v23.4s, v6.4s, v7.s[1] +stp q16, q17, [x0, #-(32 * 4)] +ldp q16, q17, [x1, #-(32 * 4)] +stp q18, q19, [x0, #-(32 * 3)] +ldp q18, q19, [x1, #-(32 * 3)] +stp q20, q21, [x0, #-(32 * 2)] +ldp q20, q21, [x1, #-(32 * 2)] +stp q22, q23, [x0, #-(32 * 1)] +ldp q22, q23, [x1, #-(32 * 1)] +fmla v16.4s, v6.4s, v29.4s +fmla v17.4s, v6.4s, v29.4s +fmla v18.4s, v6.4s, v29.4s +fmla v19.4s, v6.4s, v29.4s +fmla v20.4s, v6.4s, v29.4s +fmla v21.4s, v6.4s, v29.4s +fmla v22.4s, v6.4s, v29.4s +fmla v23.4s, v6.4s, v29.4s sub x11, x11, #8 cmp x11, #8 @@ -107,7 +108,7 @@ fmax v20.4s, v20.4s, v30.4s fmax v21.4s, v21.4s, v30.4s fmax v22.4s, v22.4s, v30.4s fmax v23.4s, v23.4s, v30.4s - +add x0, x0, #(32 * 4) fmin v16.4s, v16.4s, v31.4s fmin v17.4s, v17.4s, v31.4s fmin v18.4s, v18.4s, v31.4s @@ -116,11 +117,10 @@ fmin v20.4s, v20.4s, v31.4s fmin v21.4s, v21.4s, v31.4s fmin v22.4s, v22.4s, v31.4s fmin v23.4s, v23.4s, v31.4s -stp q16, q17, [x0], #32 -stp q18, q19, [x0], #32 - -stp q20, q21, [x0], #32 -stp q22, q23, [x0], #32 +stp q16, q17, [x0, #-(32 * 4)] +stp q18, q19, [x0, #-(32 * 3)] +stp q20, q21, [x0, #-(32 * 2)] +stp q22, q23, [x0, #-(32 * 1)] L1: cmp x11, #0 @@ -128,7 +128,7 @@ beq EndLine L1Loop: ld1 {v0.4s}, [x1], #16 -fmla v0.4s, v6.4s, v7.s[1] +fmla v0.4s, v6.4s, v29.4s fmax v0.4s, v0.4s, v30.4s fmin v0.4s, v0.4s, v31.4s st1 {v0.4s}, [x0], #16 diff --git 
a/source/backend/cpu/arm/arm64/MNNAxByClampBroadcastC4_BF16.S b/source/backend/cpu/arm/arm64/MNNAxByClampBroadcastC4_BF16.S new file mode 100644 index 00000000..4992ce8a --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNAxByClampBroadcastC4_BF16.S @@ -0,0 +1,192 @@ +// +// NEON_MNNAxByClampBroadcastC4_BF16.S +// MNN +// +// Created by MNN on 2021/03/09. +// Copyright © 2018-2021 Alibaba Group Holding Limited +// + +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function NEON_MNNAxByClampBroadcastC4_BF16 +//void NEON_MNNAxByClampBroadcastC4_BF16(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters) +//Auto: x0: C, x1:A, x2:B, x3:width +//x4:cStride, x5:aStride, x6:height, x7:parameters +ld4r {v28.4s, v29.4s, v30.4s, v31.4s}, [x7] +// ld1 {v7.4s}, [x7] +// dup v30.4s, v7.s[2] +// dup v31.4s, v7.s[3] +mov x12, #2 //sizeof(int16_t) +mul x4, x12, x4 +mul x5, x12, x5 + +LoopY: +mov x8, x0 +mov x9, x1 +ld1 {v6.4h}, [x2], #8 // 4 * sizeof(int16_t) +shll v6.4s, v6.4h, #16 + +mov x11, x3 + +L8: +cmp x11, #8 +blt L1 + +sub x11, x11, #8 +cmp x11, #8 +ldp d16, d17, [x1], #16 // 4 * 2 * sizeof(int16_t) +ldp d18, d19, [x1], #16 // 4 * 2 * sizeof(int16_t) +ldp d20, d21, [x1], #16 +ldp d22, d23, [x1], #16 + +shll v16.4s, v16.4h, #16 +shll v17.4s, v17.4h, #16 +shll v18.4s, v18.4h, #16 +shll v19.4s, v19.4h, #16 +shll v20.4s, v20.4h, #16 +shll v21.4s, v21.4h, #16 +shll v22.4s, v22.4h, #16 +shll v23.4s, v23.4h, #16 + +fmla v16.4s, v6.4s, v29.4s +fmla v17.4s, v6.4s, v29.4s +fmla v18.4s, v6.4s, v29.4s +fmla v19.4s, v6.4s, v29.4s +fmla v20.4s, v6.4s, v29.4s +fmla v21.4s, v6.4s, v29.4s +fmla v22.4s, v6.4s, v29.4s +fmla v23.4s, v6.4s, v29.4s + +blt L8ComputeEnd + +L8Loop: + +fmax v16.4s, v16.4s, v30.4s +fmax v17.4s, v17.4s, v30.4s +fmax v18.4s, v18.4s, v30.4s +fmax v19.4s, v19.4s, v30.4s +fmax v20.4s, v20.4s, v30.4s +fmax v21.4s, v21.4s, v30.4s +fmax v22.4s, v22.4s, v30.4s +fmax v23.4s, v23.4s, v30.4s + +add x0, x0, #(16 * 4) +add x1, x1, #(16 * 4) + +fmin v16.4s, v16.4s, v31.4s +fmin v17.4s, v17.4s, v31.4s +fmin v18.4s, v18.4s, v31.4s +fmin v19.4s, v19.4s, v31.4s + +fmin v20.4s, v20.4s, v31.4s +fmin v21.4s, v21.4s, v31.4s +fmin v22.4s, v22.4s, v31.4s +fmin v23.4s, v23.4s, v31.4s + +shrn v16.4h, v16.4s, #16 +shrn v17.4h, v17.4s, #16 +shrn v18.4h, v18.4s, #16 +shrn v19.4h, v19.4s, #16 +shrn v20.4h, v20.4s, #16 +shrn v21.4h, v21.4s, #16 +shrn v22.4h, v22.4s, #16 +shrn v23.4h, v23.4s, #16 + +stp d16, d17, [x0, #-(16 * 4)] +ldp d16, d17, [x1, #-(16 * 4)] // 4 * 2 * sizeof(int16_t) +stp d18, d19, [x0, #-(16 * 3)] +ldp d18, d19, [x1, #-(16 * 3)] // 4 * 2 * sizeof(int16_t) +stp d20, d21, [x0, #-(16 * 2)] +ldp d20, d21, [x1, #-(16 * 2)] +stp d22, d23, [x0, #-(16 * 1)] +ldp d22, d23, [x1, #-(16 * 1)] + +shll v16.4s, v16.4h, #16 +shll v17.4s, v17.4h, #16 +shll v18.4s, v18.4h, #16 +shll v19.4s, v19.4h, #16 +shll v20.4s, v20.4h, #16 +shll v21.4s, v21.4h, #16 +shll v22.4s, v22.4h, #16 +shll v23.4s, v23.4h, #16 + +fmla v16.4s, v6.4s, v29.4s +fmla v17.4s, v6.4s, v29.4s +fmla v18.4s, v6.4s, v29.4s +fmla v19.4s, v6.4s, v29.4s +fmla v20.4s, v6.4s, v29.4s +fmla v21.4s, v6.4s, v29.4s +fmla v22.4s, v6.4s, v29.4s +fmla v23.4s, v6.4s, v29.4s + + +sub x11, x11, #8 +cmp x11, #8 +bge L8Loop + +L8ComputeEnd: + +fmax v16.4s, v16.4s, v30.4s +fmax v17.4s, v17.4s, v30.4s +fmax v18.4s, v18.4s, v30.4s +fmax v19.4s, v19.4s, v30.4s +fmax v20.4s, v20.4s, v30.4s +fmax v21.4s, v21.4s, v30.4s +fmax v22.4s, v22.4s, v30.4s +fmax v23.4s, v23.4s, v30.4s 
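+// loop tail: clamp the final 8 vectors, narrow them back to bf16 (shrn #16) and store through the pre-advanced pointer using negative offsets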
+add x0, x0, #(16 * 4) +fmin v16.4s, v16.4s, v31.4s +fmin v17.4s, v17.4s, v31.4s +fmin v18.4s, v18.4s, v31.4s +fmin v19.4s, v19.4s, v31.4s +fmin v20.4s, v20.4s, v31.4s +fmin v21.4s, v21.4s, v31.4s +fmin v22.4s, v22.4s, v31.4s +fmin v23.4s, v23.4s, v31.4s + +shrn v16.4h, v16.4s, #16 +shrn v17.4h, v17.4s, #16 +shrn v18.4h, v18.4s, #16 +shrn v19.4h, v19.4s, #16 +shrn v20.4h, v20.4s, #16 +shrn v21.4h, v21.4s, #16 +shrn v22.4h, v22.4s, #16 +shrn v23.4h, v23.4s, #16 + +stp d16, d17, [x0, #-(16 * 4)] +stp d18, d19, [x0, #-(16 * 3)] +stp d20, d21, [x0, #-(16 * 2)] +stp d22, d23, [x0, #-(16 * 1)] + +L1: +cmp x11, #0 +beq EndLine + +L1Loop: +ld1 {v0.4h}, [x1], #8 +shll v0.4s, v0.4h, #16 + +fmla v0.4s, v6.4s, v29.4s +fmax v0.4s, v0.4s, v30.4s +fmin v0.4s, v0.4s, v31.4s + +shrn v0.4h, v0.4s, #16 +st1 {v0.4h}, [x0], #8 +subs x11, x11, #1 +bne L1Loop + +EndLine: +add x0, x8, x4 +add x1, x9, x5 + +subs x6, x6, #1 +bne LoopY + +ret + +#endif diff --git a/source/backend/cpu/arm/arm64/MNNConvDwF23MulTransUnit.S b/source/backend/cpu/arm/arm64/MNNConvDwF23MulTransUnit.S index c559cd9e..5f8a8744 100644 --- a/source/backend/cpu/arm/arm64/MNNConvDwF23MulTransUnit.S +++ b/source/backend/cpu/arm/arm64/MNNConvDwF23MulTransUnit.S @@ -56,7 +56,8 @@ fadd v0.4s, v0.4s, v2.4s fadd v3.4s, v3.4s, v1.4s fsub v1.4s, v3.4s, v2.4s -st1 {v0.4s, v1.4s}, [x2], #32 +// st1 {v0.4s, v1.4s}, [x2], #32 +stp q0, q1, [x2], #32 sub x3, x3, #2 cmp x3, #2 diff --git a/source/backend/cpu/arm/arm64/MNNConvDwF23SourceTransUnit.S b/source/backend/cpu/arm/arm64/MNNConvDwF23SourceTransUnit.S index b81e2988..6f606e3d 100644 --- a/source/backend/cpu/arm/arm64/MNNConvDwF23SourceTransUnit.S +++ b/source/backend/cpu/arm/arm64/MNNConvDwF23SourceTransUnit.S @@ -31,10 +31,12 @@ beq L1LoopEnd L1Loop: fsub v2.4s, v18.4s, v17.4s - st1 {v0.4s, v1.4s}, [x1], #32 + // st1 {v0.4s, v1.4s}, [x1], #32 + stp q0, q1, [x1], #32 fsub v3.4s, v19.4s, v17.4s mov v16.16b, v18.16b - st1 {v2.4s, v3.4s}, [x1], #32 + // st1 {v2.4s, v3.4s}, [x1], #32 + stp q2, q3, [x1], #32 mov v17.16b, v19.16b ld1 {v18.4s, v19.4s}, [x0], #32 fsub v0.4s, v16.4s, v18.4s @@ -46,8 +48,10 @@ L1LoopEnd: fsub v2.4s, v18.4s, v17.4s fsub v3.4s, v19.4s, v17.4s -st1 {v0.4s, v1.4s}, [x1], #32 -st1 {v2.4s, v3.4s}, [x1], #32 +// st1 {v0.4s, v1.4s}, [x1], #32 +// st1 {v2.4s, v3.4s}, [x1], #32 +stp q0, q1, [x1], #32 +stp q2, q3, [x1], #32 End: diff --git a/source/backend/cpu/arm/arm64/MNNConvRunForLineDepthwise_BF16.S b/source/backend/cpu/arm/arm64/MNNConvRunForLineDepthwise_BF16.S new file mode 100644 index 00000000..7427d486 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNConvRunForLineDepthwise_BF16.S @@ -0,0 +1,380 @@ +// +// NEON_MNNConvRunForLineDepthwise_BF16.S +// MNN +// +// Created by MNN on 2021/03/09. 
+// Copyright © 2018-2021 Alibaba Group Holding Limited +// + +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function NEON_MNNConvRunForLineDepthwise_BF16 +//void NEON_MNNConvRunForLineDepthwise_BF16(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, +// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep) + +//Auto Load: +//x0:dst, x1:src, x2:weight, x3:width, x4:src_w_setup, x5:fw, x6:fh, x7:dilate_x_step + +//Load From sp: +//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep +ldr x8, [sp, #0] +ldr x15, [sp, #8] +ldr x10, [sp, #16] +ldr x11, [sp, #24] + +mov x9, #2 // sizeof(int16_t) +mul x4, x9, x4 // x4(src_w_setup in byte) = sizeof(int16_t) * src_w_setup +mul x7, x9, x7 // x7(dilate_x_step in byte) = sizeof(int16_t) * dilate_x_step +mul x8, x9, x8 // x8(dilate_y_step in byte) = sizeof(int16_t) * dilate_y_step +mul x10, x9, x10 +mul x11, x9, x11 + +//dilate_y_step -> dilate_y_step - fw*dilate_x_step +mul x9, x5, x7 +sub x8, x8, x9 + +LoopDY: +mov v4.d[0], x10 +mov v4.d[1], x11 +mov v5.d[0], x0 +mov v5.d[1], x1 +mov v6.d[0], x3 + +L16: +cmp x3, #16 // calculate 16 elements along width dim +blt L8 + +mov x12, #16 +mul x12, x4, x12 // 16 * sizeof(int16_t) * src_w_setup + +L16Loop: + movi v16.4s, #0 + movi v17.4s, #0 + movi v18.4s, #0 + movi v19.4s, #0 + movi v20.4s, #0 + movi v21.4s, #0 + movi v22.4s, #0 + movi v23.4s, #0 + movi v24.4s, #0 + movi v25.4s, #0 + movi v26.4s, #0 + movi v27.4s, #0 + movi v28.4s, #0 + movi v29.4s, #0 + movi v30.4s, #0 + movi v31.4s, #0 + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L16LoopH: + mov x10, x5 + L16LoopW: + ld1 {v7.4h}, [x2], #8 // 4 * sizeof(int16_t) + ld1 {v0.4h}, [x1], x4 + shll v7.4s, v7.4h, #16 + shll v0.4s, v0.4h, #16 + + subs x10, x10, #1 + ld1 {v1.4h}, [x1], x4 + shll v1.4s, v1.4h, #16 + fmla v16.4s, v7.4s, v0.4s + fmla v17.4s, v7.4s, v1.4s + ld1 {v2.4h}, [x1], x4 + ld1 {v3.4h}, [x1], x4 + shll v2.4s, v2.4h, #16 + shll v3.4s, v3.4h, #16 + fmla v18.4s, v7.4s, v2.4s + fmla v19.4s, v7.4s, v3.4s + ld1 {v0.4h}, [x1], x4 + ld1 {v1.4h}, [x1], x4 + shll v0.4s, v0.4h, #16 + shll v1.4s, v1.4h, #16 + fmla v20.4s, v7.4s, v0.4s + fmla v21.4s, v7.4s, v1.4s + ld1 {v2.4h}, [x1], x4 + ld1 {v3.4h}, [x1], x4 + shll v2.4s, v2.4h, #16 + shll v3.4s, v3.4h, #16 + fmla v22.4s, v7.4s, v2.4s + fmla v23.4s, v7.4s, v3.4s + + ld1 {v0.4h}, [x1], x4 + ld1 {v1.4h}, [x1], x4 + shll v0.4s, v0.4h, #16 + shll v1.4s, v1.4h, #16 + + fmla v24.4s, v7.4s, v0.4s + fmla v25.4s, v7.4s, v1.4s + ld1 {v2.4h}, [x1], x4 + ld1 {v3.4h}, [x1], x4 + shll v2.4s, v2.4h, #16 + shll v3.4s, v3.4h, #16 + + fmla v26.4s, v7.4s, v2.4s + fmla v27.4s, v7.4s, v3.4s + ld1 {v0.4h}, [x1], x4 + ld1 {v1.4h}, [x1], x4 + shll v0.4s, v0.4h, #16 + shll v1.4s, v1.4h, #16 + fmla v28.4s, v7.4s, v0.4s + fmla v29.4s, v7.4s, v1.4s + ld1 {v2.4h}, [x1], x4 + ld1 {v3.4h}, [x1], x4 + shll v2.4s, v2.4h, #16 + shll v3.4s, v3.4h, #16 + fmla v30.4s, v7.4s, v2.4s + fmla v31.4s, v7.4s, v3.4s + sub x1, x1, x12 + add x1, x1, x7 + + bne L16LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L16LoopH + + sub x3, x3, #16 + shrn v16.4h, v16.4s, #16 + shrn v17.4h, v17.4s, #16 + shrn v18.4h, v18.4s, #16 + shrn v19.4h, v19.4s, #16 + shrn v20.4h, v20.4s, #16 + shrn v21.4h, v21.4s, #16 + shrn v22.4h, v22.4s, #16 + shrn v23.4h, v23.4s, #16 + shrn v24.4h, v24.4s, #16 + shrn v25.4h, v25.4s, #16 + shrn v26.4h, v26.4s, #16 + shrn v27.4h, v27.4s, #16 + shrn v28.4h, v28.4s, #16 + shrn v29.4h, v29.4s, #16 + shrn v30.4h, 
v30.4s, #16 + shrn v31.4h, v31.4s, #16 + + add x0, x0, #(16 * 8) + add x1, x13, x12 + cmp x3, #16 + mov x2, x14 + + stp d16, d17, [x0, #-(16 * 8)] + stp d18, d19, [x0, #-(16 * 7)] + stp d20, d21, [x0, #-(16 * 6)] + stp d22, d23, [x0, #-(16 * 5)] + stp d24, d25, [x0, #-(16 * 4)] + stp d26, d27, [x0, #-(16 * 3)] + stp d28, d29, [x0, #-(16 * 2)] + stp d30, d31, [x0, #-(16 * 1)] + + // st1 {v16.4h, v17.4h, v18.4h, v19.4h}, [x0], #32 // 16 * sizeof(int16_t) + // add x1, x13, x12 + // cmp x3, #16 + // mov x2, x14 + // st1 {v20.4h, v21.4h, v22.4h, v23.4h}, [x0], #32 // 16 * sizeof(int16_t) + // st1 {v24.4h, v25.4h, v26.4h, v27.4h}, [x0], #32 // 16 * sizeof(int16_t) + // st1 {v28.4h, v29.4h, v30.4h, v31.4h}, [x0], #32 // 16 * sizeof(int16_t) + // stp + + bge L16Loop + + +L8: +cmp x3, #7 +ble L4 + +mov x12, #8 +mul x12, x4, x12 + +L8Loop: + movi v16.4s, #0 + movi v17.4s, #0 + movi v18.4s, #0 + movi v19.4s, #0 + movi v20.4s, #0 + movi v21.4s, #0 + movi v22.4s, #0 + movi v23.4s, #0 + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L8LoopH: + mov x10, x5 + L8LoopW: + ld1 {v3.4h}, [x2], #8 // 4 * sizeof(int16_t) + ld1 {v0.4h}, [x1], x4 + shll v3.4s, v3.4h, #16 + shll v0.4s, v0.4h, #16 + + subs x10, x10, #1 + fmla v16.4s, v3.4s, v0.4s + ld1 {v1.4h}, [x1], x4 + shll v1.4s, v1.4h, #16 + fmla v17.4s, v3.4s, v1.4s + ld1 {v0.4h}, [x1], x4 + shll v0.4s, v0.4h, #16 + + fmla v18.4s, v0.4s, v3.4s + ld1 {v1.4h}, [x1], x4 + shll v1.4s, v1.4h, #16 + + fmla v19.4s, v1.4s, v3.4s + ld1 {v0.4h}, [x1], x4 + shll v0.4s, v0.4h, #16 + fmla v20.4s, v0.4s, v3.4s + ld1 {v1.4h}, [x1], x4 + shll v1.4s, v1.4h, #16 + fmla v21.4s, v1.4s, v3.4s + ld1 {v0.4h}, [x1], x4 + shll v0.4s, v0.4h, #16 + fmla v22.4s, v0.4s, v3.4s + ld1 {v1.4h}, [x1], x4 + shll v1.4s, v1.4h, #16 + fmla v23.4s, v1.4s, v3.4s + + sub x1, x1, x12 + add x1, x1, x7 + + bne L8LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L8LoopH + + shrn v16.4h, v16.4s, #16 + shrn v17.4h, v17.4s, #16 + shrn v18.4h, v18.4s, #16 + shrn v19.4h, v19.4s, #16 + shrn v20.4h, v20.4s, #16 + shrn v21.4h, v21.4s, #16 + shrn v22.4h, v22.4s, #16 + shrn v23.4h, v23.4s, #16 + + add x0, x0, #(16 * 4) + sub x3, x3, #8 + add x1, x13, x12 + mov x2, x14 + + stp d16, d17, [x0, #-(16 * 4)] + stp d18, d19, [x0, #-(16 * 3)] + stp d20, d21, [x0, #-(16 * 2)] + stp d22, d23, [x0, #-(16 * 1)] + + // sub x3, x3, #8 + // st1 {v16.4h, v17.4h, v18.4h, v19.4h}, [x0], #32 // 16 * sizeof(int16_t) + // add x1, x13, x12 + // mov x2, x14 + // st1 {v20.4h, v21.4h, v22.4h, v23.4h}, [x0], #32 // 16 * sizeof(int16_t) + + +L4: +cmp x3, #4 +ble L1 + +mov x12, #4 +mul x12, x4, x12 + +L4Loop: + movi v16.4s, #0 + movi v17.4s, #0 + movi v18.4s, #0 + movi v19.4s, #0 + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L4LoopH: + mov x10, x5 + L4LoopW: + ld1 {v3.4h}, [x2], #8 // 4 * sizeof(int16_t) + ld1 {v0.4h}, [x1], x4 + shll v3.4s, v3.4h, #16 + shll v0.4s, v0.4h, #16 + subs x10, x10, #1 + fmla v16.4s, v3.4s, v0.4s + ld1 {v1.4h}, [x1], x4 + shll v1.4s, v1.4h, #16 + fmla v17.4s, v3.4s, v1.4s + ld1 {v0.4h}, [x1], x4 + shll v0.4s, v0.4h, #16 + fmla v18.4s, v0.4s, v3.4s + ld1 {v1.4h}, [x1], x4 + shll v1.4s, v1.4h, #16 + fmla v19.4s, v1.4s, v3.4s + + sub x1, x1, x12 + add x1, x1, x7 + + bne L4LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L4LoopH + shrn v16.4h, v16.4s, #16 + shrn v17.4h, v17.4s, #16 + shrn v18.4h, v18.4s, #16 + shrn v19.4h, v19.4s, #16 + + add x0, x0, #(16 * 2) + sub x3, x3, #4 + add x1, x13, x12 + mov x2, x14 + stp d16, d17, [x0, #-(16 * 2)] + stp d18, d19, [x0, #-(16 * 1)] + + // st1 {v16.4h, v17.4h, v18.4h, v19.4h}, [x0], #32 
// 16 * sizeof(int16_t) + // add x1, x13, x12 + // mov x2, x14 + +L1: +cmp x3, #0 +beq End + +L1Loop: + movi v0.4s, #0 + mov x9, x6 + mov x11, x1 + mov x12, x2 + L1LoopH: + mov x10, x5 + L1LoopW: + ld1 {v1.4h}, [x1], x7 + ld1 {v2.4h}, [x2], #8 // 4 * sizeof(int16_t) + shll v1.4s, v1.4h, #16 + shll v2.4s, v2.4h, #16 + fmla v0.4s, v1.4s, v2.4s + subs x10, x10, #1 + bne L1LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L1LoopH + + shrn v0.4h, v0.4s, #16 + subs x3, x3, #1 + st1 {v0.4h}, [x0], #8 // 4 * sizeof(int16_t) + mov x2, x12 + add x1, x11, x4 + bne L1Loop + + +End: + +mov x10, v4.d[0] +mov x11, v4.d[1] +mov x0, v5.d[0] +mov x1, v5.d[1] +mov x3, v6.d[0] + +subs x15, x15, #1 +add x0, x0, x11 +add x1, x1, x10 +bne LoopDY + + +ret +//MNNConvRunForLineDepthwise End + +#endif diff --git a/source/backend/cpu/arm/arm64/MNNConvRunForUnitDepthWise_BF16.S b/source/backend/cpu/arm/arm64/MNNConvRunForUnitDepthWise_BF16.S new file mode 100644 index 00000000..75254f55 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNConvRunForUnitDepthWise_BF16.S @@ -0,0 +1,66 @@ +// +// NEON_MNNConvRunForUnitDepthWise_BF16.S +// MNN +// +// Created by MNN on 2021/03/09. +// Copyright © 2018-2021 Alibaba Group Holding Limited +// + +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function NEON_MNNConvRunForUnitDepthWise_BF16 +//void NEON_MNNConvRunForUnitDepthWise_BF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step) + +//Auto: x0:dst, x1:src, x2:weight, x3:fw +//x4:fh, x5:weight_y_step, x6:dilate_x_step, x7:dilate_y_step + +cmp x3, #0 +movi v0.4s, #0 +beq UnitEnd +cmp x4, #0 +beq UnitEnd + +mov x9, #2 +mul x5, x9, x5 // x5(weight_y_step in byte) = sizeof(int16_t) * weight_y_step +mul x6, x9, x6 // x6(dilate_x_step in byte) = sizeof(int16_t) * dilate_x_step +mul x7, x9, x7 // x7(dilate_y_step in byte) = sizeof(int16_t) * dilate_y_step + +//dilate_y_step -> dilate_y_step - dilate_x_step*fw +mul x9, x3, x6 +sub x7, x7, x9 // because x1 has already been auto-increased at 'ld1 {v1.4h}, [x1], x6', here we should rewind by x6*fw + +//weight_y_step -> weight_y_step - 4*sizeof(int16_t)*fw +mov x9, #8 +mul x9, x3, x9 +sub x5, x5, x9 + + +UnitLoopH: +mov x9, x3 +UnitLoopW: +ld1 {v1.4h}, [x1], x6 +ld1 {v2.4h}, [x2], #8 // 4 * sizeof(int16_t) +shll v1.4s, v1.4h, #16 +shll v2.4s, v2.4h, #16 + +fmla v0.4s, v1.4s, v2.4s +subs x9, x9, #1 +bne UnitLoopW +subs x4, x4, #1 +add x1, x1, x7 +add x2, x2, x5 +bne UnitLoopH + + +UnitEnd: +shrn v0.4h, v0.4s, #16 +st1 {v0.4h}, [x0] + +ret + +#endif diff --git a/source/backend/cpu/arm/arm64/MNNFloat2Int8.S b/source/backend/cpu/arm/arm64/MNNFloat2Int8.S index 2a87b112..a0e6f527 100644 --- a/source/backend/cpu/arm/arm64/MNNFloat2Int8.S +++ b/source/backend/cpu/arm/arm64/MNNFloat2Int8.S @@ -14,24 +14,35 @@ .align 5 asm_function MNNFloat2Int8 -//void MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, float* scale, size_t aMin, size_t aMax); -//x0:src, x1:dst, x2:sizeQuad, x3:scale, x4:aMin, x5:aMax +//void MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, float* scale, size_t aMin, size_t aMax, size_t zeroPoint); +//x0:src, x1:dst, x2:sizeQuad, x3:scale, x4:aMin, x5:aMax, x6:zeroPoint ld1 {v31.4s}, [x3] dup v30.16b, w4 dup v29.16b, w5 +// copy zero point +mov v28.s[0], w6 +mov v28.s[1], w6 +mov v28.s[2], w6 +mov v28.s[3], w6 +scvtf v28.4s, v28.4s + cmp x2, #3 ble FLLoop1 FLLoop4: ld1 {v0.4s, v1.4s}, [x0], #32 fmul v0.4s, v0.4s, v31.4s +fadd v0.4s, v0.4s, v28.4s 
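+// v28 holds zeroPoint converted to float; adding it after the scale multiply lets fcvtas
+// round (src * scale + zeroPoint) to the nearest signed integer in a single step.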
ld1 {v2.4s, v3.4s}, [x0], #32 fmul v1.4s, v1.4s, v31.4s +fadd v1.4s, v1.4s, v28.4s fmul v2.4s, v2.4s, v31.4s +fadd v2.4s, v2.4s, v28.4s fmul v3.4s, v3.4s, v31.4s +fadd v3.4s, v3.4s, v28.4s fcvtas v0.4s, v0.4s fcvtas v4.4s, v2.4s @@ -62,6 +73,7 @@ beq FLEnd FLLoop1: ld1 {v0.4s}, [x0], #16 fmul v0.4s, v0.4s, v31.4s +fadd v0.4s, v0.4s, v28.4s //st1 {v31.4s}, [x0], #16 fcvtas v0.4s, v0.4s diff --git a/source/backend/cpu/arm/arm64/MNNGemmFloatCommon_4.S b/source/backend/cpu/arm/arm64/MNNGemmFloatCommon_4.S deleted file mode 100644 index f4fcb314..00000000 --- a/source/backend/cpu/arm/arm64/MNNGemmFloatCommon_4.S +++ /dev/null @@ -1,550 +0,0 @@ -// -// MNNGemmFloatCommon_4.S -// MNN -// -// Created by MNN on 2018/03/08. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNGemmFloatCommon_4 -//void MNNGemmFloatCommon_4(float* dst, const float* src, const float* weight, size_t src_depth_quad, -// size_t dst_step, size_t dst_depth_quad, size_t width, size_t weight_depth_offset) - -//Auto Load: -//x0:dst, x1:src, x2:weight, x3:src_depth_quad, x4:dst_step, x5:dst_depth_quad, x6: width, x7: weight_depth_offset - -sub sp, sp, #128 -st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 -st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 -//step multi by sizeof(float) -mov x12, #4 -mul x4, x12, x4 -mul x7, x12, x7 - -//x8: src_z_step -mov x12, #16 -mul x8, x12, x6 - -//x9: weight_z_step -mov x12, #64 -mul x9, x12, x3 -add x9, x7, x9 - -cmp x6, #4 -blt L2 - -L4: -mov x10, x0 -mov x12, x2 -mov x14, x5 -add x15, x7, x3, LSL #6 -add x9, x12, x15 -add x15, x9, x15 - -cmp x5, #3 -blt L4_L4LoopDz - -L4_L12LoopDz: -mov x11, x1 -mov x13, x3 - -ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x12], #64 -ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x9], #64 -ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x15], #64 -ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x11], x8 -fmul v16.4s, v0.4s, v12.s[0] -fmul v17.4s, v0.4s, v13.s[0] -fmul v18.4s, v0.4s, v14.s[0] -fmul v19.4s, v0.4s, v15.s[0] -fmul v20.4s, v4.4s, v12.s[0] -fmul v21.4s, v4.4s, v13.s[0] -fmul v22.4s, v4.4s, v14.s[0] -fmul v23.4s, v4.4s, v15.s[0] -fmul v24.4s, v8.4s, v12.s[0] -fmul v25.4s, v8.4s, v13.s[0] -fmul v26.4s, v8.4s, v14.s[0] -fmul v27.4s, v8.4s, v15.s[0] - -subs x13, x13, #1 -beq L4_L12LoopZEnd - -L4_L12LoopZ: - prfm pldl1keep, [x12, #64] - prfm pldl1keep, [x9, #64] - prfm pldl1keep, [x15, #64] - prfm pldl1keep, [x11, x8] - - fmla v16.4s, v1.4s, v12.s[1] - fmla v17.4s, v1.4s, v13.s[1] - fmla v18.4s, v1.4s, v14.s[1] - fmla v19.4s, v1.4s, v15.s[1] - fmla v20.4s, v5.4s, v12.s[1] - fmla v21.4s, v5.4s, v13.s[1] - fmla v22.4s, v5.4s, v14.s[1] - fmla v23.4s, v5.4s, v15.s[1] - fmla v24.4s, v9.4s, v12.s[1] - fmla v25.4s, v9.4s, v13.s[1] - fmla v26.4s, v9.4s, v14.s[1] - fmla v27.4s, v9.4s, v15.s[1] - - fmla v16.4s, v2.4s, v12.s[2] - fmla v17.4s, v2.4s, v13.s[2] - fmla v18.4s, v2.4s, v14.s[2] - fmla v19.4s, v2.4s, v15.s[2] - fmla v20.4s, v6.4s, v12.s[2] - fmla v21.4s, v6.4s, v13.s[2] - fmla v22.4s, v6.4s, v14.s[2] - fmla v23.4s, v6.4s, v15.s[2] - fmla v24.4s, v10.4s, v12.s[2] - fmla v25.4s, v10.4s, v13.s[2] - fmla v26.4s, v10.4s, v14.s[2] - fmla v27.4s, v10.4s, v15.s[2] - - fmla v16.4s, v3.4s, v12.s[3] - fmla v17.4s, v3.4s, v13.s[3] - fmla v18.4s, v3.4s, v14.s[3] - fmla v19.4s, v3.4s, v15.s[3] - fmla v20.4s, v7.4s, v12.s[3] - fmla v21.4s, v7.4s, v13.s[3] - fmla v22.4s, v7.4s, v14.s[3] - fmla v23.4s, v7.4s, v15.s[3] - fmla v24.4s, v11.4s, v12.s[3] - fmla v25.4s, v11.4s, v13.s[3] - fmla v26.4s, v11.4s, v14.s[3] - fmla 
v27.4s, v11.4s, v15.s[3] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x12], #64 - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x9], #64 - ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x15], #64 - ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x11], x8 - - fmla v16.4s, v0.4s, v12.s[0] - fmla v17.4s, v0.4s, v13.s[0] - fmla v18.4s, v0.4s, v14.s[0] - fmla v19.4s, v0.4s, v15.s[0] - fmla v20.4s, v4.4s, v12.s[0] - fmla v21.4s, v4.4s, v13.s[0] - fmla v22.4s, v4.4s, v14.s[0] - fmla v23.4s, v4.4s, v15.s[0] - fmla v24.4s, v8.4s, v12.s[0] - fmla v25.4s, v8.4s, v13.s[0] - fmla v26.4s, v8.4s, v14.s[0] - fmla v27.4s, v8.4s, v15.s[0] - - subs x13, x13, #1 - bne L4_L12LoopZ - -L4_L12LoopZEnd: - -fmla v16.4s, v1.4s, v12.s[1] -fmla v17.4s, v1.4s, v13.s[1] -fmla v18.4s, v1.4s, v14.s[1] -fmla v19.4s, v1.4s, v15.s[1] -fmla v20.4s, v5.4s, v12.s[1] -fmla v21.4s, v5.4s, v13.s[1] -fmla v22.4s, v5.4s, v14.s[1] -fmla v23.4s, v5.4s, v15.s[1] -fmla v24.4s, v9.4s, v12.s[1] -fmla v25.4s, v9.4s, v13.s[1] -fmla v26.4s, v9.4s, v14.s[1] -fmla v27.4s, v9.4s, v15.s[1] - -fmla v16.4s, v2.4s, v12.s[2] -fmla v17.4s, v2.4s, v13.s[2] -fmla v18.4s, v2.4s, v14.s[2] -fmla v19.4s, v2.4s, v15.s[2] -fmla v20.4s, v6.4s, v12.s[2] -fmla v21.4s, v6.4s, v13.s[2] -fmla v22.4s, v6.4s, v14.s[2] -fmla v23.4s, v6.4s, v15.s[2] -fmla v24.4s, v10.4s, v12.s[2] -fmla v25.4s, v10.4s, v13.s[2] -fmla v26.4s, v10.4s, v14.s[2] -fmla v27.4s, v10.4s, v15.s[2] - -fmla v16.4s, v3.4s, v12.s[3] -fmla v17.4s, v3.4s, v13.s[3] -fmla v18.4s, v3.4s, v14.s[3] -fmla v19.4s, v3.4s, v15.s[3] -fmla v20.4s, v7.4s, v12.s[3] -fmla v21.4s, v7.4s, v13.s[3] -fmla v22.4s, v7.4s, v14.s[3] -fmla v23.4s, v7.4s, v15.s[3] -fmla v24.4s, v11.4s, v12.s[3] -fmla v25.4s, v11.4s, v13.s[3] -fmla v26.4s, v11.4s, v14.s[3] -fmla v27.4s, v11.4s, v15.s[3] - -st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x10], x4 -st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x10], x4 -st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], x4 -add x15, x7, x3, LSL #6 -add x12, x12, x7 -add x12, x12, x15, LSL #1 -add x9, x12, x15 -add x15, x9, x15 -subs x14, x14, #3 -beq L4End -cmp x14, #3 -bge L4_L12LoopDz - -L4_L4LoopDz: -mov x11, x1 -ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x12], #64 -ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x11], x8 -fmul v8.4s, v0.4s, v4.s[0] -fmul v9.4s, v0.4s, v5.s[0] -fmul v10.4s, v0.4s, v6.s[0] -fmul v11.4s, v0.4s, v7.s[0] -fmul v12.4s, v1.4s, v4.s[1] -fmul v13.4s, v1.4s, v5.s[1] -fmul v14.4s, v1.4s, v6.s[1] -fmul v15.4s, v1.4s, v7.s[1] -fmul v16.4s, v2.4s, v4.s[2] -fmul v17.4s, v2.4s, v5.s[2] -fmul v18.4s, v2.4s, v6.s[2] -fmul v19.4s, v2.4s, v7.s[2] -fmul v20.4s, v3.4s, v4.s[3] -fmul v21.4s, v3.4s, v5.s[3] -fmul v22.4s, v3.4s, v6.s[3] -fmul v23.4s, v3.4s, v7.s[3] -subs x13, x3, #1 -beq L4_L4LoopZEnd - -L4_L4LoopZ: - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x12], #64 - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x11], x8 - fmla v8.4s, v0.4s, v4.s[0] - fmla v9.4s, v0.4s, v5.s[0] - fmla v10.4s, v0.4s, v6.s[0] - fmla v11.4s, v0.4s, v7.s[0] - fmla v12.4s, v1.4s, v4.s[1] - fmla v13.4s, v1.4s, v5.s[1] - fmla v14.4s, v1.4s, v6.s[1] - fmla v15.4s, v1.4s, v7.s[1] - fmla v16.4s, v2.4s, v4.s[2] - fmla v17.4s, v2.4s, v5.s[2] - fmla v18.4s, v2.4s, v6.s[2] - fmla v19.4s, v2.4s, v7.s[2] - fmla v20.4s, v3.4s, v4.s[3] - fmla v21.4s, v3.4s, v5.s[3] - fmla v22.4s, v3.4s, v6.s[3] - fmla v23.4s, v3.4s, v7.s[3] - subs x13, x13, #1 - bne L4_L4LoopZ - -L4_L4LoopZEnd: -fadd v8.4s, v8.4s, v12.4s -fadd v9.4s, v9.4s, v13.4s -fadd v10.4s, v10.4s, v14.4s -fadd v11.4s, v11.4s, v15.4s -fadd v16.4s, v16.4s, v20.4s -fadd v17.4s, v17.4s, v21.4s -fadd v18.4s, v18.4s, v22.4s -fadd v19.4s, v19.4s, v23.4s 
-fadd v8.4s, v8.4s, v16.4s -fadd v9.4s, v9.4s, v17.4s -fadd v10.4s, v10.4s, v18.4s -fadd v11.4s, v11.4s, v19.4s -st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x10], x4 -add x12, x12, x7 -subs x14, x14, #1 -bne L4_L4LoopDz - -L4End: -add x0, x0, #64 -add x1, x1, #64 -sub x6, x6, #4 -cmp x6, #4 -bge L4 - -L2: -cmp x6, #2 -blt L1 -sub x6, x6, #2 -mov x10, x0 -mov x12, x2 -mov x14, x5 -cmp x5, #3 -blt L2_L2LoopDz -add x15, x7, x3, LSL #6 -add x9, x12, x15 -add x15, x9, x15 - -L2_L12LoopDz: -mov x11, x1 -mov x13, x3 - -ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x12], #64 -ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x9], #64 -ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x15], #64 -ld1 {v12.4s, v13.4s}, [x11], x8 -fmul v14.4s, v0.4s, v12.s[0] -fmul v15.4s, v0.4s, v13.s[0] -fmul v20.4s, v1.4s, v12.s[1] -fmul v21.4s, v1.4s, v13.s[1] -fmul v16.4s, v4.4s, v12.s[0] -fmul v17.4s, v4.4s, v13.s[0] -fmul v22.4s, v5.4s, v12.s[1] -fmul v23.4s, v5.4s, v13.s[1] -lsl x8, x8, #2 -fmul v18.4s, v8.4s, v12.s[0] -fmul v19.4s, v8.4s, v13.s[0] -fmul v24.4s, v9.4s, v12.s[1] -fmul v25.4s, v9.4s, v13.s[1] -subs x13, x13, #1 -beq L2_L12LoopZEnd - -L2_L12LoopZ: - prfm pldl1keep, [x12, #256] - prfm pldl1keep, [x9, #256] - prfm pldl1keep, [x15, #256] - prfm pldl1keep, [x11, x8] - - fmla v14.4s, v2.4s, v12.s[2] - fmla v15.4s, v2.4s, v13.s[2] - fmla v20.4s, v3.4s, v12.s[3] - fmla v21.4s, v3.4s, v13.s[3] - fmla v16.4s, v6.4s, v12.s[2] - fmla v17.4s, v6.4s, v13.s[2] - fmla v22.4s, v7.4s, v12.s[3] - fmla v23.4s, v7.4s, v13.s[3] - lsr x8, x8, #2 - fmla v18.4s, v10.4s, v12.s[2] - fmla v19.4s, v10.4s, v13.s[2] - fmla v24.4s, v11.4s, v12.s[3] - fmla v25.4s, v11.4s, v13.s[3] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x12], #64 - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x9], #64 - ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x15], #64 - ld1 {v12.4s, v13.4s}, [x11], x8 - fmla v14.4s, v0.4s, v12.s[0] - fmla v15.4s, v0.4s, v13.s[0] - fmla v16.4s, v4.4s, v12.s[0] - fmla v17.4s, v4.4s, v13.s[0] - fmla v18.4s, v8.4s, v12.s[0] - fmla v19.4s, v8.4s, v13.s[0] - fmla v20.4s, v1.4s, v12.s[1] - fmla v21.4s, v1.4s, v13.s[1] - lsl x8, x8, #2 - fmla v22.4s, v5.4s, v12.s[1] - fmla v23.4s, v5.4s, v13.s[1] - fmla v24.4s, v9.4s, v12.s[1] - fmla v25.4s, v9.4s, v13.s[1] - - subs x13, x13, #1 - bne L2_L12LoopZ - -L2_L12LoopZEnd: -fmla v14.4s, v2.4s, v12.s[2] -fmla v15.4s, v2.4s, v13.s[2] -fmla v16.4s, v6.4s, v12.s[2] -fmla v17.4s, v6.4s, v13.s[2] -fmla v18.4s, v10.4s, v12.s[2] -fmla v19.4s, v10.4s, v13.s[2] -fmla v20.4s, v3.4s, v12.s[3] -fmla v21.4s, v3.4s, v13.s[3] -lsr x8, x8, #2 -fmla v22.4s, v7.4s, v12.s[3] -fmla v23.4s, v7.4s, v13.s[3] -fmla v24.4s, v11.4s, v12.s[3] -fmla v25.4s, v11.4s, v13.s[3] -fadd v14.4s, v14.4s, v20.4s -fadd v15.4s, v15.4s, v21.4s -fadd v16.4s, v16.4s, v22.4s -fadd v17.4s, v17.4s, v23.4s -fadd v18.4s, v18.4s, v24.4s -fadd v19.4s, v19.4s, v25.4s -st1 {v14.4s, v15.4s}, [x10], x4 -st1 {v16.4s, v17.4s}, [x10], x4 -st1 {v18.4s, v19.4s}, [x10], x4 -add x15, x7, x3, LSL #6 -add x12, x12, x7 -add x12, x12, x15, LSL #1 -add x9, x12, x15 -add x15, x9, x15 -subs x14, x14, #3 -beq L2End -cmp x14, #3 -bge L2_L12LoopDz - -L2_L2LoopDz: -mov x11, x1 -subs x13, x3, #1 -ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x12], #64 -ld1 {v4.4s, v5.4s}, [x11], x8 -fmul v6.4s, v0.4s, v4.s[0] -fmul v7.4s, v0.4s, v5.s[0] -fmul v8.4s, v1.4s, v4.s[1] -fmul v9.4s, v1.4s, v5.s[1] -fmul v10.4s, v2.4s, v4.s[2] -fmul v11.4s, v2.4s, v5.s[2] -fmul v12.4s, v3.4s, v4.s[3] -fmul v13.4s, v3.4s, v5.s[3] -beq L2_L2LoopZEnd - -L2_L2LoopZ: - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x12], #64 - ld1 {v4.4s, v5.4s}, [x11], x8 - fmla v6.4s, 
v0.4s, v4.s[0] - fmla v7.4s, v0.4s, v5.s[0] - fmla v8.4s, v1.4s, v4.s[1] - fmla v9.4s, v1.4s, v5.s[1] - fmla v10.4s, v2.4s, v4.s[2] - fmla v11.4s, v2.4s, v5.s[2] - fmla v12.4s, v3.4s, v4.s[3] - fmla v13.4s, v3.4s, v5.s[3] - subs x13, x13, #1 - bne L2_L2LoopZ - -L2_L2LoopZEnd: -fadd v6.4s, v6.4s, v8.4s -fadd v7.4s, v7.4s, v9.4s -fadd v10.4s, v10.4s, v12.4s -fadd v11.4s, v11.4s, v13.4s -fadd v6.4s, v6.4s, v10.4s -fadd v7.4s, v7.4s, v11.4s -st1 {v6.4s, v7.4s}, [x10], x4 -add x12, x12, x7 -subs x14, x14, #1 -bne L2_L2LoopDz - -L2End: -add x0, x0, #32 -add x1, x1, #32 - -L1: -lsl x15, x8, #1 -#lsl x15, x8, #2 -cmp x6, #1 -blt End -mov x10, x0 -mov x12, x2 -mov x14, x5 -cmp x5, #3 -blt L1_L1LoopDz -add x15, x7, x3, LSL #6 -add x9, x12, x15 -add x15, x9, x15 - -L1_L12LoopDz: -mov x11, x1 -mov x13, x3 - -ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x12], #64 -ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x9], #64 -ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x15], #64 -ld1 {v12.4s}, [x11], x8 -fmul v13.4s, v0.4s, v12.s[0] -fmul v14.4s, v4.4s, v12.s[0] -fmul v15.4s, v8.4s, v12.s[0] -fmul v16.4s, v1.4s, v12.s[1] -fmul v17.4s, v5.4s, v12.s[1] -fmul v18.4s, v9.4s, v12.s[1] -fmul v19.4s, v2.4s, v12.s[2] -fmul v20.4s, v6.4s, v12.s[2] -lsl x8, x8, #2 -fmul v21.4s, v10.4s, v12.s[2] -fmul v22.4s, v3.4s, v12.s[3] -fmul v23.4s, v7.4s, v12.s[3] -fmul v24.4s, v11.4s, v12.s[3] -subs x13, x13, #1 -beq L1_L12LoopZEnd - -L1_L12LoopZ: - prfm pldl1keep, [x12, #256] - prfm pldl1keep, [x9, #256] - prfm pldl1keep, [x15, #256] - prfm pldl1keep, [x11, x8] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x12], #64 - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x9], #64 - lsr x8, x8, #2 - ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x15], #64 - ld1 {v12.4s}, [x11], x8 - fmla v13.4s, v0.4s, v12.s[0] - fmla v14.4s, v4.4s, v12.s[0] - fmla v15.4s, v8.4s, v12.s[0] - fmla v16.4s, v1.4s, v12.s[1] - fmla v17.4s, v5.4s, v12.s[1] - fmla v18.4s, v9.4s, v12.s[1] - fmla v19.4s, v2.4s, v12.s[2] - fmla v20.4s, v6.4s, v12.s[2] - lsl x8, x8, #2 - fmla v21.4s, v10.4s, v12.s[2] - fmla v22.4s, v3.4s, v12.s[3] - fmla v23.4s, v7.4s, v12.s[3] - fmla v24.4s, v11.4s, v12.s[3] - subs x13, x13, #1 - bne L1_L12LoopZ - -L1_L12LoopZEnd: -fadd v13.4s, v13.4s, v16.4s -fadd v14.4s, v14.4s, v17.4s -fadd v15.4s, v15.4s, v18.4s -fadd v19.4s, v19.4s, v22.4s -lsr x8, x8, #2 -fadd v20.4s, v20.4s, v23.4s -fadd v21.4s, v21.4s, v24.4s -fadd v13.4s, v13.4s, v19.4s -fadd v14.4s, v14.4s, v20.4s -fadd v15.4s, v15.4s, v21.4s -st1 {v13.4s}, [x10], x4 -st1 {v14.4s}, [x10], x4 -st1 {v15.4s}, [x10], x4 -add x15, x7, x3, LSL #6 -add x12, x12, x7 -add x12, x12, x15, LSL #1 -add x9, x12, x15 -add x15, x9, x15 -subs x14, x14, #3 -beq End -cmp x14, #3 -bge L1_L12LoopDz - -L1_L1LoopDz: -mov x11, x1 -subs x13, x3, #1 -ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x12], #64 -ld1 {v4.4s}, [x11], x8 -fmul v5.4s, v0.4s, v4.s[0] -fmul v6.4s, v1.4s, v4.s[1] -fmul v7.4s, v2.4s, v4.s[2] -fmul v8.4s, v3.4s, v4.s[3] -beq L1_L1LoopZEnd - -L1_L1LoopZ: - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x12], #64 - ld1 {v4.4s}, [x11], x8 - fmla v5.4s, v0.4s, v4.s[0] - fmla v6.4s, v1.4s, v4.s[1] - fmla v7.4s, v2.4s, v4.s[2] - fmla v8.4s, v3.4s, v4.s[3] - subs x13, x13, #1 - bne L1_L1LoopZ - -L1_L1LoopZEnd: -fadd v5.4s, v5.4s, v6.4s -fadd v7.4s, v7.4s, v8.4s -fadd v5.4s, v5.4s, v7.4s -st1 {v5.4s}, [x10], x4 -add x12, x12, x7 -subs x14, x14, #1 -bne L1_L1LoopDz - -End: - -sub sp, sp, #128 -ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 -ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 -ret - -#endif diff --git a/source/backend/cpu/arm/arm64/MNNGemmFloatOne_4.S 
b/source/backend/cpu/arm/arm64/MNNGemmFloatOne_4.S deleted file mode 100644 index dbd0cd87..00000000 --- a/source/backend/cpu/arm/arm64/MNNGemmFloatOne_4.S +++ /dev/null @@ -1,151 +0,0 @@ -// -// MNNGemmFloatOne_4.S -// MNN -// -// Created by MNN on 2019/02/14. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNGemmFloatOne_4 -//void MNNGemmFloatOne_4(float* dst, const float* src, const float* weight, size_t src_depth_quad, -// size_t dst_step, size_t dst_depth_quad, size_t weight_depth_offset) - -//Auto Load: -//x0:dst, x1:src, x2:weight, x3: src_depth_quad -//x4:dst_step, x5:dst_depth_quad, x6:weight_depth_offset - -sub sp, sp, #128 -st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 -st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 -//step multi by sizeof(float) -mov x12, #4 -mul x4, x12, x4 -mul x6, x12, x6 - -mov x12, #64 //16*sizeof(float) -mul x9, x12, x3 -add x9, x6, x9 - -cmp x5, #3 -blt L1_L1LoopDz - -add x7, x2, x9 -add x8, x2, x9, LSL #1 - -L1_L12LoopDz: -mov x11, x1 -mov x13, x3 - -ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 -ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x7], #64 -ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x8], #64 -ld1 {v12.4s}, [x11], #16 -fmul v13.4s, v0.4s, v12.s[0] -fmul v14.4s, v4.4s, v12.s[0] -fmul v15.4s, v8.4s, v12.s[0] -fmul v16.4s, v1.4s, v12.s[1] -fmul v17.4s, v5.4s, v12.s[1] -fmul v18.4s, v9.4s, v12.s[1] -fmul v19.4s, v2.4s, v12.s[2] -fmul v20.4s, v6.4s, v12.s[2] -fmul v21.4s, v10.4s, v12.s[2] -fmul v22.4s, v3.4s, v12.s[3] -fmul v23.4s, v7.4s, v12.s[3] -fmul v24.4s, v11.4s, v12.s[3] -subs x13, x13, #1 -beq L1_L12LoopZEnd - -L1_L12LoopZ: - prfm pldl1keep, [x2, #256] - prfm pldl1keep, [x7, #256] - prfm pldl1keep, [x8, #256] - prfm pldl1keep, [x11, #128] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x7], #64 - ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x8], #64 - ld1 {v12.4s}, [x11], #16 - fmla v13.4s, v0.4s, v12.s[0] - fmla v14.4s, v4.4s, v12.s[0] - fmla v15.4s, v8.4s, v12.s[0] - fmla v16.4s, v1.4s, v12.s[1] - fmla v17.4s, v5.4s, v12.s[1] - fmla v18.4s, v9.4s, v12.s[1] - fmla v19.4s, v2.4s, v12.s[2] - fmla v20.4s, v6.4s, v12.s[2] - fmla v21.4s, v10.4s, v12.s[2] - fmla v22.4s, v3.4s, v12.s[3] - fmla v23.4s, v7.4s, v12.s[3] - fmla v24.4s, v11.4s, v12.s[3] - subs x13, x13, #1 - bne L1_L12LoopZ - -L1_L12LoopZEnd: -fadd v13.4s, v13.4s, v16.4s -fadd v14.4s, v14.4s, v17.4s -fadd v15.4s, v15.4s, v18.4s -fadd v19.4s, v19.4s, v22.4s -fadd v20.4s, v20.4s, v23.4s -fadd v21.4s, v21.4s, v24.4s -fadd v13.4s, v13.4s, v19.4s -fadd v14.4s, v14.4s, v20.4s -fadd v15.4s, v15.4s, v21.4s -st1 {v13.4s}, [x0], x4 -st1 {v14.4s}, [x0], x4 -st1 {v15.4s}, [x0], x4 -add x2, x2, x6 -add x7, x7, x6 -add x8, x8, x6 -add x2, x2, x9, LSL #1 -add x7, x7, x9, LSL #1 -add x8, x8, x9, LSL #1 -subs x5, x5, #3 -beq End -cmp x5, #3 -bge L1_L12LoopDz - -L1_L1LoopDz: -mov x11, x1 -subs x13, x3, #1 -ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 -ld1 {v4.4s}, [x11], #16 -fmul v5.4s, v0.4s, v4.s[0] -fmul v6.4s, v1.4s, v4.s[1] -fmul v7.4s, v2.4s, v4.s[2] -fmul v8.4s, v3.4s, v4.s[3] -beq L1_L1LoopZEnd - -L1_L1LoopZ: - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 - ld1 {v4.4s}, [x11], #16 - fmla v5.4s, v0.4s, v4.s[0] - fmla v6.4s, v1.4s, v4.s[1] - fmla v7.4s, v2.4s, v4.s[2] - fmla v8.4s, v3.4s, v4.s[3] - subs x13, x13, #1 - bne L1_L1LoopZ - -L1_L1LoopZEnd: -fadd v5.4s, v5.4s, v6.4s -fadd v7.4s, v7.4s, v8.4s -fadd v5.4s, v5.4s, v7.4s -st1 {v5.4s}, [x0], x4 -add x2, x2, x6 -subs x5, x5, #1 -bne 
L1_L1LoopDz - -End: - -sub sp, sp, #128 -ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 -ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 -ret - -#endif diff --git a/source/backend/cpu/arm/arm64/MNNGemmFloatUnit_4.S b/source/backend/cpu/arm/arm64/MNNGemmFloatUnit_4.S deleted file mode 100644 index c3723fe3..00000000 --- a/source/backend/cpu/arm/arm64/MNNGemmFloatUnit_4.S +++ /dev/null @@ -1,282 +0,0 @@ -// -// MNNGemmFloatUnit_4.S -// MNN -// -// Created by MNN on 2019/02/04. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNGemmFloatUnit_4 -//void MNNGemmFloatUnit_4(float* dstOrigin, const float* src, const float* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, size_t weight_depth_offset) - -//Auto -//x0: dst, x1:src, x2:weight, x3:src_depth_quad - -//x4:dst_step, x5:dst_depth_quad, x6: weight_depth_offset - -mov x12, #4 //sizeof(float) -mul x4, x12, x4 -mul x6, x12, x6 -add x11, x6, x3, LSL #6 - -sub sp, sp, #128 -st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 -st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 - -cmp x5, #2 -blt LoopDzExtra - -LoopDz: -mov x8, x1 -subs x9, x3, #1 - -ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x2] -add x2, x2, x11 -ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64 -fmul v16.4s, v8.4s, v0.s[0] -fmul v17.4s, v8.4s, v1.s[0] -ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x8], #64 -fmul v18.4s, v8.4s, v2.s[0] -fmul v19.4s, v8.4s, v3.s[0] -ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x2], #64 -sub x2, x2, x11 -fmul v20.4s, v8.4s, v4.s[0] -fmul v21.4s, v8.4s, v5.s[0] -fmul v22.4s, v8.4s, v6.s[0] -fmul v23.4s, v8.4s, v7.s[0] -fmul v24.4s, v12.4s, v0.s[0] -fmul v25.4s, v12.4s, v1.s[0] -fmul v26.4s, v12.4s, v2.s[0] -fmul v27.4s, v12.4s, v3.s[0] -fmul v28.4s, v12.4s, v4.s[0] -fmul v29.4s, v12.4s, v5.s[0] -fmul v30.4s, v12.4s, v6.s[0] -fmul v31.4s, v12.4s, v7.s[0] - -beq L8LoopZEnd -L8LoopZ: - add x2, x2, #128 - prfm pldl1keep, [x2] - prfm pldl1keep, [x2, x11] - sub x2, x2, #128 - prfm pldl1keep, [x8, #128] - prfm pldl1keep, [x8, #192] - - fmla v16.4s, v9.4s, v0.s[1] - fmla v17.4s, v9.4s, v1.s[1] - fmla v18.4s, v9.4s, v2.s[1] - fmla v19.4s, v9.4s, v3.s[1] - fmla v20.4s, v9.4s, v4.s[1] - fmla v21.4s, v9.4s, v5.s[1] - fmla v22.4s, v9.4s, v6.s[1] - fmla v23.4s, v9.4s, v7.s[1] - fmla v24.4s, v13.4s, v0.s[1] - fmla v25.4s, v13.4s, v1.s[1] - fmla v26.4s, v13.4s, v2.s[1] - fmla v27.4s, v13.4s, v3.s[1] - fmla v28.4s, v13.4s, v4.s[1] - fmla v29.4s, v13.4s, v5.s[1] - fmla v30.4s, v13.4s, v6.s[1] - fmla v31.4s, v13.4s, v7.s[1] - - fmla v16.4s, v10.4s, v0.s[2] - fmla v17.4s, v10.4s, v1.s[2] - fmla v18.4s, v10.4s, v2.s[2] - fmla v19.4s, v10.4s, v3.s[2] - fmla v20.4s, v10.4s, v4.s[2] - fmla v21.4s, v10.4s, v5.s[2] - fmla v22.4s, v10.4s, v6.s[2] - fmla v23.4s, v10.4s, v7.s[2] - fmla v24.4s, v14.4s, v0.s[2] - fmla v25.4s, v14.4s, v1.s[2] - fmla v26.4s, v14.4s, v2.s[2] - fmla v27.4s, v14.4s, v3.s[2] - fmla v28.4s, v14.4s, v4.s[2] - fmla v29.4s, v14.4s, v5.s[2] - fmla v30.4s, v14.4s, v6.s[2] - fmla v31.4s, v14.4s, v7.s[2] - - fmla v16.4s, v11.4s, v0.s[3] - fmla v17.4s, v11.4s, v1.s[3] - fmla v18.4s, v11.4s, v2.s[3] - fmla v19.4s, v11.4s, v3.s[3] - fmla v20.4s, v11.4s, v4.s[3] - fmla v21.4s, v11.4s, v5.s[3] - fmla v22.4s, v11.4s, v6.s[3] - fmla v23.4s, v11.4s, v7.s[3] - fmla v24.4s, v15.4s, v0.s[3] - fmla v25.4s, v15.4s, v1.s[3] - fmla v26.4s, v15.4s, v2.s[3] - fmla v27.4s, v15.4s, v3.s[3] - fmla v28.4s, v15.4s, v4.s[3] - fmla v29.4s, v15.4s, v5.s[3] - fmla v30.4s, v15.4s, v6.s[3] - fmla v31.4s, v15.4s, 
v7.s[3] - - ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x2] - add x2, x2, x11 - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64 - fmla v16.4s, v8.4s, v0.s[0] - fmla v17.4s, v8.4s, v1.s[0] - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x8], #64 - fmla v18.4s, v8.4s, v2.s[0] - fmla v19.4s, v8.4s, v3.s[0] - ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x2], #64 - sub x2, x2, x11 - fmla v20.4s, v8.4s, v4.s[0] - fmla v21.4s, v8.4s, v5.s[0] - fmla v22.4s, v8.4s, v6.s[0] - fmla v23.4s, v8.4s, v7.s[0] - fmla v24.4s, v12.4s, v0.s[0] - fmla v25.4s, v12.4s, v1.s[0] - fmla v26.4s, v12.4s, v2.s[0] - fmla v27.4s, v12.4s, v3.s[0] - fmla v28.4s, v12.4s, v4.s[0] - fmla v29.4s, v12.4s, v5.s[0] - fmla v30.4s, v12.4s, v6.s[0] - fmla v31.4s, v12.4s, v7.s[0] - - subs x9, x9, #1 - bne L8LoopZ - -L8LoopZEnd: -fmla v16.4s, v9.4s, v0.s[1] -fmla v17.4s, v9.4s, v1.s[1] -fmla v18.4s, v9.4s, v2.s[1] -fmla v19.4s, v9.4s, v3.s[1] -fmla v20.4s, v9.4s, v4.s[1] -fmla v21.4s, v9.4s, v5.s[1] -fmla v22.4s, v9.4s, v6.s[1] -fmla v23.4s, v9.4s, v7.s[1] -fmla v24.4s, v13.4s, v0.s[1] -fmla v25.4s, v13.4s, v1.s[1] -fmla v26.4s, v13.4s, v2.s[1] -fmla v27.4s, v13.4s, v3.s[1] -fmla v28.4s, v13.4s, v4.s[1] -fmla v29.4s, v13.4s, v5.s[1] -fmla v30.4s, v13.4s, v6.s[1] -fmla v31.4s, v13.4s, v7.s[1] - -fmla v16.4s, v10.4s, v0.s[2] -fmla v17.4s, v10.4s, v1.s[2] -fmla v18.4s, v10.4s, v2.s[2] -fmla v19.4s, v10.4s, v3.s[2] -fmla v20.4s, v10.4s, v4.s[2] -fmla v21.4s, v10.4s, v5.s[2] -fmla v22.4s, v10.4s, v6.s[2] -fmla v23.4s, v10.4s, v7.s[2] -fmla v24.4s, v14.4s, v0.s[2] -fmla v25.4s, v14.4s, v1.s[2] -fmla v26.4s, v14.4s, v2.s[2] -fmla v27.4s, v14.4s, v3.s[2] -fmla v28.4s, v14.4s, v4.s[2] -fmla v29.4s, v14.4s, v5.s[2] -fmla v30.4s, v14.4s, v6.s[2] -fmla v31.4s, v14.4s, v7.s[2] - -mov x12, x0 - -fmla v16.4s, v11.4s, v0.s[3] -fmla v17.4s, v11.4s, v1.s[3] -fmla v18.4s, v11.4s, v2.s[3] -fmla v19.4s, v11.4s, v3.s[3] -fmla v20.4s, v11.4s, v4.s[3] -fmla v21.4s, v11.4s, v5.s[3] -fmla v22.4s, v11.4s, v6.s[3] -fmla v23.4s, v11.4s, v7.s[3] -fmla v24.4s, v15.4s, v0.s[3] -fmla v25.4s, v15.4s, v1.s[3] -fmla v26.4s, v15.4s, v2.s[3] -fmla v27.4s, v15.4s, v3.s[3] -fmla v28.4s, v15.4s, v4.s[3] -st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 -fmla v29.4s, v15.4s, v5.s[3] -st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 -fmla v30.4s, v15.4s, v6.s[3] -add x0, x12, x4 -st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], #64 -add x2, x2, x11 -fmla v31.4s, v15.4s, v7.s[3] -add x2, x2, x6 -sub x5, x5, #2 -st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0], #64 -add x0, x12, x4, LSL #1 - -cmp x5, #1 -blt LoopDzEnd -bgt LoopDz - -LoopDzExtra: - -mov w11, #0 -mov x8, x1 -mov x9, x3 -dup v16.4s, w11 -dup v17.4s, w11 -dup v18.4s, w11 -dup v19.4s, w11 -dup v20.4s, w11 -dup v21.4s, w11 -dup v22.4s, w11 -dup v23.4s, w11 - -L4LoopZ: - ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x2], #64 - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64 - fmla v16.4s, v8.4s, v0.s[0] - fmla v17.4s, v8.4s, v1.s[0] - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x8], #64 - fmla v18.4s, v8.4s, v2.s[0] - fmla v19.4s, v8.4s, v3.s[0] - fmla v20.4s, v8.4s, v4.s[0] - fmla v21.4s, v8.4s, v5.s[0] - fmla v22.4s, v8.4s, v6.s[0] - fmla v23.4s, v8.4s, v7.s[0] - - fmla v16.4s, v9.4s, v0.s[1] - fmla v17.4s, v9.4s, v1.s[1] - fmla v18.4s, v9.4s, v2.s[1] - fmla v19.4s, v9.4s, v3.s[1] - fmla v20.4s, v9.4s, v4.s[1] - fmla v21.4s, v9.4s, v5.s[1] - fmla v22.4s, v9.4s, v6.s[1] - fmla v23.4s, v9.4s, v7.s[1] - - fmla v16.4s, v10.4s, v0.s[2] - fmla v17.4s, v10.4s, v1.s[2] - fmla v18.4s, v10.4s, v2.s[2] - fmla v19.4s, v10.4s, v3.s[2] - fmla v20.4s, v10.4s, v4.s[2] - fmla v21.4s, v10.4s, 
v5.s[2] - fmla v22.4s, v10.4s, v6.s[2] - fmla v23.4s, v10.4s, v7.s[2] - - fmla v16.4s, v11.4s, v0.s[3] - fmla v17.4s, v11.4s, v1.s[3] - fmla v18.4s, v11.4s, v2.s[3] - fmla v19.4s, v11.4s, v3.s[3] - fmla v20.4s, v11.4s, v4.s[3] - fmla v21.4s, v11.4s, v5.s[3] - fmla v22.4s, v11.4s, v6.s[3] - fmla v23.4s, v11.4s, v7.s[3] - - subs x9, x9, #1 - bne L4LoopZ - -st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 -st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 - -LoopDzEnd: -sub sp, sp, #128 -ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 -ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 - -ret -#endif diff --git a/source/backend/cpu/arm/arm64/MNNInt8ScaleToFloat.S b/source/backend/cpu/arm/arm64/MNNInt8ScaleToFloat.S index eb84ec26..15f77852 100644 --- a/source/backend/cpu/arm/arm64/MNNInt8ScaleToFloat.S +++ b/source/backend/cpu/arm/arm64/MNNInt8ScaleToFloat.S @@ -16,10 +16,17 @@ asm_function MNNInt8ScaleToFloat // void MNNInt8ScaleToFloat(float* dst, -// const int8_t* src, const float* scale, size_t size) +// const int8_t* src, const float* scale, size_t size, ssize_t zeroPoint) // Auto Load: -// x0: dst*, x1: src*, x2: scale*, x3: size +// x0: dst*, x1: src*, x2: scale*, x3: size, x4: zeroPoint + +// copy zero point +mov v28.s[0], w4 +mov v28.s[1], w4 +mov v28.s[2], w4 +mov v28.s[3], w4 +scvtf v28.4s, v28.4s cmp x3, #0 beq End @@ -43,11 +50,15 @@ L4Loop: scvtf v4.4s, v0.4s scvtf v5.4s, v1.4s scvtf v6.4s, v2.4s + fsub v4.4s, v4.4s, v28.4s + fsub v5.4s, v5.4s, v28.4s fmul v0.4s, v4.4s, v16.4s fmul v1.4s, v5.4s, v16.4s scvtf v7.4s, v3.4s + fsub v6.4s, v6.4s, v28.4s fmul v2.4s, v6.4s, v16.4s st1 {v0.4s, v1.4s}, [x0], #32 + fsub v7.4s, v7.4s, v28.4s fmul v3.4s, v7.4s, v16.4s cmp x3, #4 st1 {v2.4s, v3.4s}, [x0], #32 @@ -62,6 +73,7 @@ L1Loop: sxtl v0.8h, v17.8b sxtl v1.4s, v0.4h scvtf v2.4s, v1.4s + fsub v2.4s, v2.4s, v28.4s fmul v1.4s, v2.4s, v16.4s st1 {v1.4s}, [x0], #16 diff --git a/source/backend/cpu/arm/arm64/MNNPackC4ForMatMul_A.S b/source/backend/cpu/arm/arm64/MNNPackC4ForMatMul_A.S index 7beeab48..2d147185 100644 --- a/source/backend/cpu/arm/arm64/MNNPackC4ForMatMul_A.S +++ b/source/backend/cpu/arm/arm64/MNNPackC4ForMatMul_A.S @@ -6,30 +6,14 @@ // Copyright © 2018, Alibaba Group Holding Limited // #ifdef __aarch64__ - #include "MNNAsmGlobal.h" -.text -.align 5 -asm_function MNNPackC4ForMatMul_A -//void MNNPackC4ForMatMul_A(float* dest, const float* source, size_t e, size_t l, size_t eReal) -//Auto: x0: dest, x1:source, x2: e, x3:l, x4: eReal -// eReal -> eReal * 4 * sizeof(float) - 192 -mov x13, #4 -mov x12, #16 -mul x4, x12, x4 -mul x8, x13, x2 - -sub x4, x4, #192 - -// Set x13 as l * 12 * sizeof(float) -mov x12, #48 -mul x13, x3, x12 - -Body: -cmp x2, #12 -blt Right +// [x0, x1, x2, x3] => [x0, x6, x2, x3] =mov=> [x0, x1, x2, x3] .macro transpose_4x4 x0, x1, x2, x3, x5, x6 +// x0: [00,01,02,03] \ x5:[00,10,02,12] \ x0:[00,10,20,30] +// x1: [10,11,12,13] ===\ x1:[01,11,03,13] ===\ x6:[01,11,21,31] +// x2: [20,21,22,23] ===/ x6:[20,30,22,32] ===/ x2:[02,12,22,32] +// x3: [30,31,32,33] / x3:[21,31,23,33] / x3:[03,13,23,33] trn1 \x5\().4s, \x0\().4s, \x1\().4s trn2 \x1\().4s, \x0\().4s, \x1\().4s trn1 \x6\().4s, \x2\().4s, \x3\().4s @@ -41,139 +25,219 @@ blt Right mov \x1\().16b, \x6\().16b .endm -LoopE12: - mov x6, x0 - mov x7, x1 - mov x5, x3 - cmp x5, #4 +.text +.align 5 +asm_function MNNPackC4ForMatMul_A +//void MNNPackC4ForMatMul_A(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el) +//Auto: x0: dest, x1:sourceGroup, x2: info, x3:el +ldr w10, [x2, #0] // number 
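+// info = {number, eReal, eDest, xOffset}; the per-group el array read below is {e, l, eOffset, lOffset}.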
+mov x4, #0 +mov x11, #0 +mov x6, #0 +ldr w4, [x2, #4] // eReal +ldr w11, [x2, #8] // eDest +ldr w6, [x2, #12] // xOffset +// xOffset -> xOffset * 4 * sizeof(float) +// eReal -> eReal * 4 * sizeof(float) +// eDest -> eDest * sizeof(float) +mov x12, #4 // sizeof(float). kept as a const +mov x9, #16 +mul x4, x9, x4 +mul x11, x12, x11 +mul x6, x9, x6 + +LoopNumber: +mov x8, #0 +mov x7, #0 +ldr w5, [x3, #4] // l +ldr w8, [x3, #8] // eOffset +ldr w7, [x3, #12] // lOffset + +mov x13, x0 +mov x14, x1 +ldr x1, [x1, #0] + +// Compute dest ptr: x0 = x0 + eOffset * sizeof(float) + lOffset * eDest * sizeof(float) +mul x7, x11, x7 +mul x8, x12, x8 +add x0, x0, x7 +add x0, x0, x8 + +ldr w2, [x3, #0] // e + +Body: +cmp w2, #12 +blt Right + cmp w5, #4 blt LoopEL3 LoopL4: + mov x2, x1 .macro MAIN_TRANSPOSE - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64 - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64 + ld1 {v0.4s}, [x1], x6 + ld1 {v3.4s}, [x1], x6 + ld1 {v6.4s}, [x1], x6 + ld1 {v17.4s}, [x1], x6 + ld1 {v1.4s}, [x1], x6 + ld1 {v4.4s}, [x1], x6 + ld1 {v7.4s}, [x1], x6 + ld1 {v18.4s}, [x1], x6 + ld1 {v2.4s}, [x1], x6 + ld1 {v5.4s}, [x1], x6 + ld1 {v16.4s}, [x1], x6 + ld1 {v19.4s}, [x1], x6 - ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x1], #64 - - transpose_4x4 v0, v1, v2, v3, v21, v22 - transpose_4x4 v4, v5, v6, v7, v23, v24 - transpose_4x4 v16, v17, v18, v19, v25, v26 + transpose_4x4 v0, v3, v6, v17, v23, v24 + transpose_4x4 v1, v4, v7, v18, v25, v26 + transpose_4x4 v2, v5, v16, v19, v27, v28 .endm MAIN_TRANSPOSE - stp q0, q4, [x0], #32 - stp q16, q1, [x0], #32 - stp q5, q17, [x0], #32 - stp q2, q6, [x0], #32 - stp q18, q3, [x0], #32 - stp q7, q19, [x0], #32 - add x1, x1, x4 + stp q0, q1, [x0] + stp q2, q3, [x0, #(32 * 1)] + stp q4, q5, [x0, #(32 * 2)] + stp q6, q7, [x0, #(32 * 3)] + stp q16, q17, [x0, #(32 * 4)] + stp q18, q19, [x0, #(32 * 5)] + add x0, x0, #(32 * 6) + + // st1 {v0.4s}, [x0], #16 + // st1 {v4.4s}, [x0], #16 + // st1 {v16.4s}, [x0], #16 + // st1 {v1.4s}, [x0], #16 + // st1 {v5.4s}, [x0], #16 + // st1 {v17.4s}, [x0], #16 + // st1 {v2.4s}, [x0], #16 + // st1 {v6.4s}, [x0], #16 + // st1 {v18.4s}, [x0], #16 + // st1 {v3.4s}, [x0], #16 + // st1 {v7.4s}, [x0], #16 + // st1 {v19.4s}, [x0], #16 + + add x1, x2, x4 sub x5, x5, #4 - cmp x5, #4 + cmp w5, #4 bge LoopL4 LoopEL3: - cmp x5, #3 + cmp w5, #3 blt LoopEL2 MAIN_TRANSPOSE - st1 {v0.4s}, [x0], #16 - st1 {v4.4s}, [x0], #16 - st1 {v16.4s}, [x0], #16 + stp q0, q1, [x0] + stp q2, q3, [x0, #(32 * 1)] + stp q4, q5, [x0, #(32 * 2)] + stp q6, q7, [x0, #(32 * 3)] + str q16, [x0, #(32 * 4)] + add x0, x0, #(32 * 4 + 16) - st1 {v1.4s}, [x0], #16 - st1 {v5.4s}, [x0], #16 - st1 {v17.4s}, [x0], #16 + // st1 {v0.4s}, [x0], #16 + // st1 {v4.4s}, [x0], #16 + // st1 {v16.4s}, [x0], #16 +// + // st1 {v1.4s}, [x0], #16 + // st1 {v5.4s}, [x0], #16 + // st1 {v17.4s}, [x0], #16 +// + // st1 {v2.4s}, [x0], #16 + // st1 {v6.4s}, [x0], #16 + // st1 {v18.4s}, [x0], #16 - st1 {v2.4s}, [x0], #16 - st1 {v6.4s}, [x0], #16 - st1 {v18.4s}, [x0], #16 - - sub x5, x5, #3 + b LoopEEnd LoopEL2: - cmp x5, #2 + cmp w5, #2 blt LoopEL1 MAIN_TRANSPOSE - st1 {v0.4s}, [x0], #16 - st1 {v4.4s}, [x0], #16 - st1 {v16.4s}, [x0], #16 + stp q0, q1, [x0] + stp q2, q3, [x0, #(32 * 1)] + stp q4, q5, [x0, #(32 * 2)] + add x0, x0, #(32 * 3) - st1 {v1.4s}, [x0], #16 - st1 {v5.4s}, [x0], #16 - st1 {v17.4s}, [x0], #16 - sub x5, x5, #2 + // st1 {v0.4s}, [x0], #16 + // st1 {v4.4s}, [x0], #16 + // st1 {v16.4s}, [x0], #16 +// + // st1 {v1.4s}, [x0], #16 + // st1 {v5.4s}, [x0], #16 + // st1 {v17.4s}, [x0], #16 + b 
LoopEEnd LoopEL1: - cmp x5, #1 + cmp w5, #1 blt LoopEEnd MAIN_TRANSPOSE - st1 {v0.4s}, [x0], #16 - st1 {v4.4s}, [x0], #16 - st1 {v16.4s}, [x0], #16 + + stp q0, q1, [x0] + str q2, [x0, #32] + add x0, x0, #(32 + 16) + + // st1 {v0.4s}, [x0], #16 + // st1 {v4.4s}, [x0], #16 + // st1 {v16.4s}, [x0], #16 LoopEEnd: +b End - sub x2, x2, #12 - cmp x2, #12 - add x0, x6, x13 - add x1, x7, #192 // 12 * 4 * sizeof(float) - bge LoopE12 - -cmp x2, #0 -beq End Right: -add x4, x4, #192 LoopE1: - mov x6, x0 + mov w9, w5 mov x7, x1 - mov x5, x3 - cmp x5, #4 + mov x8, x0 + cmp w5, #4 blt LoopE1L3 LoopE1L4: ld1 {v0.4s}, [x1], x4 - st1 {v0.s}[0], [x0], x8 - st1 {v0.s}[1], [x0], x8 - st1 {v0.s}[2], [x0], x8 - st1 {v0.s}[3], [x0], x8 - sub x5, x5, #4 - cmp x5, #4 + st1 {v0.s}[0], [x0], x11 + st1 {v0.s}[1], [x0], x11 + st1 {v0.s}[2], [x0], x11 + st1 {v0.s}[3], [x0], x11 + sub w5, w5, #4 + cmp w5, #4 bge LoopE1L4 LoopE1L3: - cmp x5, #3 + cmp w5, #3 blt LoopE1L2 ld1 {v0.4s}, [x1], x4 - st1 {v0.s}[0], [x0], x8 - st1 {v0.s}[1], [x0], x8 - st1 {v0.s}[2], [x0], x8 - sub x5, x5, #3 + st1 {v0.s}[0], [x0], x11 + st1 {v0.s}[1], [x0], x11 + st1 {v0.s}[2], [x0], x11 + + sub w5, w5, #3 LoopE1L2: - cmp x5, #2 + cmp w5, #2 blt LoopE1L1 - ld1 {v0.d}[0], [x1], x4 - st1 {v0.s}[0], [x0], x8 - st1 {v0.s}[1], [x0], x8 - sub x5, x5, #2 + ld1 {v0.2s}, [x1], x4 + st1 {v0.s}[0], [x0], x11 + st1 {v0.s}[1], [x0], x11 + sub w5, w5, #2 LoopE1L1: - cmp x5, #1 + cmp w5, #1 blt LoopE1End ld1 {v0.s}[0], [x1], x4 - st1 {v0.s}[0], [x0], x8 + st1 {v0.s}[0], [x0], x11 LoopE1End: - subs x2, x2, #1 - add x0, x6, #4 - add x1, x7, #16 // 4 * sizeof(float) + subs w2, w2, #1 + add x0, x8, x12 + add x1, x7, x6 + mov w5, w9 bne LoopE1 End: +mov x0, x13 +mov x1, x14 +subs w10, w10, #1 +add x3, x3, #16 +add x1, x1, #8 +bne LoopNumber ret diff --git a/source/backend/cpu/arm/arm64/MNNPackC4ForMatMul_A_BF16.S b/source/backend/cpu/arm/arm64/MNNPackC4ForMatMul_A_BF16.S new file mode 100644 index 00000000..1feef787 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNPackC4ForMatMul_A_BF16.S @@ -0,0 +1,260 @@ + +// +// NEON_MNNPackC4ForMatMul_A_BF16.S +// MNN +// +// Created by MNN on 2021/02/26. +// Copyright © 2018-2021 Alibaba Group Holding Limited +// +#ifdef __aarch64__ +#include "MNNAsmGlobal.h" + +.macro transpose_4x4 x0, x1, x2, x3, x5, x6 // transpose 4x4 of sizeof(int16_t), only low half simd vector is valid. + trn1 \x5\().4h, \x0\().4h, \x1\().4h + trn2 \x1\().4h, \x0\().4h, \x1\().4h + trn1 \x6\().4h, \x2\().4h, \x3\().4h + trn2 \x3\().4h, \x2\().4h, \x3\().4h + trn1 \x0\().2s, \x5\().2s, \x6\().2s + trn2 \x2\().2s, \x5\().2s, \x6\().2s + trn1 \x6\().2s, \x1\().2s, \x3\().2s + trn2 \x3\().2s, \x1\().2s, \x3\().2s + mov \x1\().8b, \x6\().8b +.endm + +.text +.align 5 +asm_function NEON_MNNPackC4ForMatMul_A_BF16 +// treate float pointer as int16_t* +//void NEON_MNNPackC4ForMatMul_A_BF16(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el) +//Auto: x0: dest, x1:sourceGroup, x2: info, x3:el +ldr w10, [x2, #0] // number +mov x4, #0 +mov x11, #0 +mov x6, #0 +ldr w4, [x2, #4] // eReal +ldr w11, [x2, #8] // eDest +ldr w6, [x2, #12] // xOffset +// xOffset -> xOffset * 4 * sizeof(int16_t) +// eReal -> eReal * 4 * sizeof(int16_t) +// eDest -> eDest * sizeof(int16_t) +mov x12, #2 // sizeof(int16_t). 
kept as a const +mov x9, #8 +mul x4, x9, x4 +mul x11, x12, x11 +mul x6, x9, x6 + +LoopNumber: +mov x2, #0 +mov x5, #0 +mov x8, #0 +mov x7, #0 +ldr w5, [x3, #4] // l +ldr w8, [x3, #8] // eOffset +ldr w7, [x3, #12] // lOffset + +mov x13, x0 +mov x14, x1 +ldr x1, [x1, #0] + +// Compute dest ptr: x0 = x0 + eOffset * sizeof(int16_t) + lOffset * eDest * sizeof(int16_t) +mul x7, x11, x7 +mul x8, x12, x8 +add x0, x0, x7 +add x0, x0, x8 + +ldr w2, [x3, #0] // e + +Body: +cmp w2, #12 // original eDest +blt Right + cmp w5, #4 + blt LoopEL3 + LoopL4: + mov x2, x1 +.macro MAIN_TRANSPOSE + ld1 {v0.4h}, [x1], x6 // load size: 4 * sizeof(int16_t), jump one stride line as x6 + ld1 {v3.4h}, [x1], x6 + ld1 {v6.4h}, [x1], x6 + ld1 {v17.4h}, [x1], x6 + ld1 {v1.4h}, [x1], x6 + ld1 {v4.4h}, [x1], x6 + ld1 {v7.4h}, [x1], x6 + ld1 {v18.4h}, [x1], x6 + ld1 {v2.4h}, [x1], x6 + ld1 {v5.4h}, [x1], x6 + ld1 {v16.4h}, [x1], x6 + ld1 {v19.4h}, [x1], x6 + + transpose_4x4 v0, v3, v6, v17, v23, v24 + transpose_4x4 v1, v4, v7, v18, v25, v26 + transpose_4x4 v2, v5, v16, v19, v27, v28 +.endm + MAIN_TRANSPOSE + + stp d0, d1, [x0] // store size: 2 * 4 * sizeof(int16_t) + stp d2, d3, [x0, #(16 * 1)] + stp d4, d5, [x0, #(16 * 2)] + stp d6, d7, [x0, #(16 * 3)] + stp d16, d17, [x0, #(16 * 4)] + stp d18, d19, [x0, #(16 * 5)] + add x0, x0, #(16 * 6) + + // st1 {v0.4h}, [x0], #8 // store size: 4 * sizeof(int16_t) + // st1 {v1.4h}, [x0], #8 + // st1 {v2.4h}, [x0], #8 + // st1 {v3.4h}, [x0], #8 + // st1 {v4.4h}, [x0], #8 + // st1 {v5.4h}, [x0], #8 + // st1 {v6.4h}, [x0], #8 + // st1 {v7.4h}, [x0], #8 + // st1 {v16.4h}, [x0], #8 + // st1 {v17.4h}, [x0], #8 + // st1 {v18.4h}, [x0], #8 + // st1 {v19.4h}, [x0], #8 + + // st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 + // st1 {v4.4h, v5.4h, v6.4h, v7.4h}, [x0], #32 + // st1 {v16.4h, v17.4h, v18.4h, v19.4h}, [x0], #32 + + add x1, x2, x4 + sub x5, x5, #4 + cmp w5, #4 + bge LoopL4 + + LoopEL3: + cmp w5, #3 + blt LoopEL2 + MAIN_TRANSPOSE + + stp d0, d1, [x0] // store size: 2 * 4 * sizeof(int16_t) + stp d2, d3, [x0, #(16 * 1)] + stp d4, d5, [x0, #(16 * 2)] + stp d6, d7, [x0, #(16 * 3)] + str d16, [x0, #(16 * 4)] + add x0, x0, #(16 * 4 + 8) + + // st1 {v0.4h}, [x0], #8 // store size: 4 * sizeof(int16_t) + // st1 {v1.4h}, [x0], #8 + // st1 {v2.4h}, [x0], #8 + // st1 {v3.4h}, [x0], #8 + // st1 {v4.4h}, [x0], #8 + // st1 {v5.4h}, [x0], #8 + // st1 {v6.4h}, [x0], #8 + // st1 {v7.4h}, [x0], #8 + // st1 {v16.4h}, [x0], #8 + + // st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 + // st1 {v4.4h, v5.4h, v6.4h, v7.4h}, [x0], #32 + // st1 {v16.4h}, [x0], #8 + + b LoopEEnd + + LoopEL2: + cmp w5, #2 + blt LoopEL1 + MAIN_TRANSPOSE + stp d0, d1, [x0] // store size: 2 * 4 * sizeof(int16_t) + stp d2, d3, [x0, #(16 * 1)] + stp d4, d5, [x0, #(16 * 2)] + add x0, x0, #(16 * 3) + + // st1 {v0.4h}, [x0], #8 // store size: 4 * sizeof(int16_t) + // st1 {v1.4h}, [x0], #8 + // st1 {v2.4h}, [x0], #8 + // st1 {v3.4h}, [x0], #8 + // st1 {v4.4h}, [x0], #8 + // st1 {v5.4h}, [x0], #8 + + // st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 + // st1 {v4.4h, v5.4h}, [x0], #16 + + b LoopEEnd + + LoopEL1: + cmp w5, #1 + blt LoopEEnd + MAIN_TRANSPOSE + stp d0, d1, [x0] + str d2, [x0, #16] + add x0, x0, #(16 + 8) + + // st1 {v0.4h}, [x0], #8 // store size: 4 * sizeof(int16_t) + // st1 {v1.4h}, [x0], #8 + // st1 {v2.4h}, [x0], #8 + + // st1 {v0.4h, v1.4h, v2.4h}, [x0], #24 + + LoopEEnd: + +b End + + +Right: + +LoopE1: + mov w9, w5 + mov x7, x1 + mov x8, x0 + cmp w5, #4 + blt LoopE1L3 + LoopE1L4: + ld1 {v0.4h}, [x1], x4 + st1 {v0.h}[0], [x0], x11 + 
st1 {v0.h}[1], [x0], x11 + st1 {v0.h}[2], [x0], x11 + st1 {v0.h}[3], [x0], x11 + sub w5, w5, #4 + cmp w5, #4 + bge LoopE1L4 + + LoopE1L3: + cmp w5, #3 + blt LoopE1L2 + ld1 {v0.4h}, [x1], x4 + st1 {v0.h}[0], [x0], x11 + st1 {v0.h}[1], [x0], x11 + st1 {v0.h}[2], [x0], x11 + + sub w5, w5, #3 + + LoopE1L2: + cmp w5, #2 + blt LoopE1L1 + ld1 {v0.4h}, [x1], x4 + st1 {v0.h}[0], [x0], x11 + st1 {v0.h}[1], [x0], x11 + sub w5, w5, #2 + + LoopE1L1: + cmp w5, #1 + blt LoopE1End + ld1 {v0.h}[0], [x1], x4 + st1 {v0.h}[0], [x0], x11 + + LoopE1End: + + subs w2, w2, #1 + add x0, x8, x12 // !!!! caution : sizeof(int16_t) + add x1, x7, x6 + mov w5, w9 + bne LoopE1 + +End: + +mov x0, x13 +mov x1, x14 +subs w10, w10, #1 + +// x3 is (const int32_t* el), this array size of 4. as a result for next struct element, +// address added by 4 * sizeof(int32_t) +add x3, x3, #16 + +// x1 is (const int16_t** sourceGroup), even though data content is int16_t, +// the element in sourceGroup in 'int16_t*', as a result for next struct element, +// value added by sizeof(void*) +add x1, x1, #8 +bne LoopNumber + +ret + +#endif diff --git a/source/backend/cpu/arm/arm64/MNNPackC4_BF16.S b/source/backend/cpu/arm/arm64/MNNPackC4_BF16.S new file mode 100644 index 00000000..65857c70 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNPackC4_BF16.S @@ -0,0 +1,172 @@ +// +// MNNPackC4_BF16.S +// MNN +// +// Created by MNN on 2021/02/24. +// Copyright © 2018-2021 Alibaba Group Holding Limited. +// + +#ifdef __aarch64__ +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNPackC4_BF16 +//void MNNPackC4_BF16(float* dst, const float* src, size_t area, size_t depth) +//Auto load: +//x0:dst, x1:src, x2:area, x3:depth +mul x4, x2, x3 +cmp x4, #0 +beq UpEnd + + +//x4: srcDepthOffset:area*sizeof(float) +mov x4, #2 // sizeof(int16_t) +mul x4, x2, x4 + +UpL4: +cmp x3, #3 +ble UpL3 + +UpL4Loop: +add x5, x1, x4 +add x6, x4, x5 +add x7, x4, x6 +mov x8, x2 +cmp x8, #3 +ble UpL4AreaRemain +UpL4AreaLoop: +ld1 {v0.4h}, [x1], #8 // 4 * sizeof(int16_t) +ld1 {v1.4h}, [x5], #8 +ld1 {v2.4h}, [x6], #8 +ld1 {v3.4h}, [x7], #8 + +st4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 // 16 * sizeof(int16_t) +sub x8, x8, #4 +cmp x8, #4 +bge UpL4AreaLoop + +UpL4AreaRemain: +cmp x8, #0 +beq UpL4AreaRemainEnd +UpL4AreaRemainLoop: +ld1 {v0.h}[0], [x1], #2 // sizeof(int16_t) +ld1 {v0.h}[1], [x5], #2 +ld1 {v0.h}[2], [x6], #2 +ld1 {v0.h}[3], [x7], #2 + +st1 {v0.4h}, [x0], #8 // 4 * sizeof(int16_t) + +subs x8, x8, #1 +bne UpL4AreaRemainLoop +UpL4AreaRemainEnd: +sub x3, x3, #4 +mov x1, x7 +cmp x3, #4 +bge UpL4Loop + +UpL3: +cmp x3, #2 +ble UpL2 +add x5, x1, x4 +add x6, x4, x5 +mov x8, x2 +cmp x8, #3 +ble UpL3AreaRemain +UpL3AreaLoop: +ld1 {v0.4h}, [x1], #8 // 4 * sizeof(int16_t) +movi v3.4h, #0 +ld1 {v1.4h}, [x5], #8 +ld1 {v2.4h}, [x6], #8 + +st4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 // 16 * sizeof(int16_t) +sub x8, x8, #4 +cmp x8, #4 +bge UpL3AreaLoop + +cmp x8, #0 +beq UpL3AreaRemainEnd +UpL3AreaRemain: +movi v0.4h, #0 +ld1 {v0.h}[0], [x1], #2 // sizeof(int16_t) +ld1 {v0.h}[1], [x5], #2 +ld1 {v0.h}[2], [x6], #2 + +st1 {v0.4h}, [x0], #8 // 4 * sizeof(int16_t) + +subs x8, x8, #1 +bne UpL3AreaRemain + +UpL3AreaRemainEnd: +sub x3, x3, #3 + + +UpL2: +cmp x3, #1 +ble UpL1 +add x5, x1, x4 +mov x8, x2 +cmp x8, #3 +ble UpL2AreaRemain +UpL2AreaLoop: +ld1 {v0.4h}, [x1], #8 // 4 * sizeof(int16_t) +movi v3.4h, #0 +ld1 {v1.4h}, [x5], #8 +movi v2.4h, #0 + +st4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 // 16 * sizeof(int16_t) +sub x8, x8, #4 +cmp x8, #4 +bge UpL2AreaLoop + +cmp x8, #0 
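+// any leftover area elements (fewer than 4) are handled one at a time below,
+// with channels 2 and 3 of each 4-channel output group zero-filled.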
+beq UpL2AreaRemainEnd +UpL2AreaRemain: +movi v0.4s, #0 +ld1 {v0.h}[0], [x1], #2 // 2 * sizeof(int16_t) +ld1 {v0.h}[1], [x5], #2 + +st1 {v0.4h}, [x0], #8 // 4 * sizeof(int16_t) + +subs x8, x8, #1 +bne UpL2AreaRemain + +UpL2AreaRemainEnd: +sub x3, x3, #2 + +UpL1: +cmp x3, #0 +beq UpEnd +mov x8, x2 +cmp x8, #3 +ble UpL1AreaRemain +UpL1AreaLoop: +ld1 {v0.4h}, [x1], #8 // 4 * sizeof(int16_t) +movi v3.4h, #0 +movi v1.4h, #0 +movi v2.4h, #0 + +st4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 // 16 * sizeof(int16_t) +sub x8, x8, #4 +cmp x8, #4 +bge UpL1AreaLoop + +cmp x8, #0 +beq UpL1AreaRemainEnd +UpL1AreaRemain: +movi v0.4h, #0 +ld1 {v0.h}[0], [x1], #2 // sizeof(int16_t) + +st1 {v0.4h}, [x0], #8 //4 * sizeof(int16_t) + +subs x8, x8, #1 +bne UpL1AreaRemain + +UpL1AreaRemainEnd: + +UpEnd: + +ret + +#endif diff --git a/source/backend/cpu/arm/arm64/MNNPackC8_BF16.S b/source/backend/cpu/arm/arm64/MNNPackC8_BF16.S new file mode 100644 index 00000000..87503e83 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNPackC8_BF16.S @@ -0,0 +1,126 @@ +// +// MNNPackC8_BF16.S +// MNN +// +// Created by MNN on 2021/02/20. +// Copyright © 2018-2021 Alibaba Group Holding Limited. +// +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + + +.text +.align 5 +asm_function MNNPackC8_BF16 +// treate float pointer as int16_t* +//void MNNPackC8_BF16(float* dest, const float* source, size_t l, size_t h); +// h, l -> hC8, l, 8 +// Auto: x0:dest, x1:source, x2: l, x3: h +// x4: lC8, x5:hC8, x6: sourceStride, x7: destStride + +lsr x4, x2, #3 +lsr x5, x3, #3 +mov x12, #2 // sizeof(int16_t) +mov x13, #16 // 8 * sizeof(int16_t) +mul x6, x12, x2 +mul x7, x13, x2 +mov x12, #16 // 8 * sizeof(int16_t) +mul x15, x12, x2 + +.macro transpose_4x4 x0, x1, x2, x3, x5, x6 + trn1 \x5\().4s, \x0\().4s, \x1\().4s + trn2 \x1\().4s, \x0\().4s, \x1\().4s + trn1 \x6\().4s, \x2\().4s, \x3\().4s + trn2 \x3\().4s, \x2\().4s, \x3\().4s + trn1 \x0\().2d, \x5\().2d, \x6\().2d + trn2 \x2\().2d, \x5\().2d, \x6\().2d + trn1 \x6\().2d, \x1\().2d, \x3\().2d + trn2 \x3\().2d, \x1\().2d, \x3\().2d + mov \x1\().16b, \x6\().16b +.endm + +LoopH: +mov x8, x0 +mov x9, x1 +mov x12, x4 + +LoopL: +mov x10, x9 +ld1 {v16.4h, v17.4h}, [x9], x6 +ld1 {v18.4h, v19.4h}, [x9], x6 +ld1 {v20.4h, v21.4h}, [x9], x6 +ld1 {v22.4h, v23.4h}, [x9], x6 + +ld1 {v24.4h, v25.4h}, [x9], x6 +ld1 {v26.4h, v27.4h}, [x9], x6 +ld1 {v28.4h, v29.4h}, [x9], x6 +ld1 {v30.4h, v31.4h}, [x9], x6 + +shll v16.4s, v16.4h, #16 +shll v17.4s, v17.4h, #16 +shll v18.4s, v18.4h, #16 +shll v19.4s, v19.4h, #16 +shll v20.4s, v20.4h, #16 +shll v21.4s, v21.4h, #16 +shll v22.4s, v22.4h, #16 +shll v23.4s, v23.4h, #16 +shll v24.4s, v24.4h, #16 +shll v25.4s, v25.4h, #16 +shll v26.4s, v26.4h, #16 +shll v27.4s, v27.4h, #16 +shll v28.4s, v28.4h, #16 +shll v29.4s, v29.4h, #16 +shll v30.4s, v30.4h, #16 +shll v31.4s, v31.4h, #16 + + +transpose_4x4 v16, v18, v20, v22, v0, v1 +transpose_4x4 v17, v19, v21, v23, v2, v3 +transpose_4x4 v24, v26, v28, v30, v4, v5 +transpose_4x4 v25, v27, v29, v31, v6, v7 + + +shrn v16.4h, v16.4s, #16 +shrn v17.4h, v17.4s, #16 +shrn v18.4h, v18.4s, #16 +shrn v19.4h, v19.4s, #16 +shrn v20.4h, v20.4s, #16 +shrn v21.4h, v21.4s, #16 +shrn v22.4h, v22.4s, #16 +shrn v23.4h, v23.4s, #16 +shrn v24.4h, v24.4s, #16 +shrn v25.4h, v25.4s, #16 +shrn v26.4h, v26.4s, #16 +shrn v27.4h, v27.4s, #16 +shrn v28.4h, v28.4s, #16 +shrn v29.4h, v29.4s, #16 +shrn v30.4h, v30.4s, #16 +shrn v31.4h, v31.4s, #16 + + +stp d16, d24, [x8], #16 +stp d18, d26, [x8], #16 +stp d20, d28, [x8], #16 +stp d22, d30, [x8], #16 + +stp d17, d25, 
[x8], #16 +stp d19, d27, [x8], #16 +stp d21, d29, [x8], #16 +stp d23, d31, [x8], #16 + +add x9, x10, #16 // 8 * sizeof(int16_t) + +subs x12, x12, #1 +bne LoopL + + +subs x5, x5, #1 +add x0, x0, x7 +add x1, x1, x15 +bne LoopH + + +ret + +#endif diff --git a/source/backend/cpu/arm/arm64/MNNPackedMatMul.S b/source/backend/cpu/arm/arm64/MNNPackedMatMul.S index f792b045..04b0a931 100644 --- a/source/backend/cpu/arm/arm64/MNNPackedMatMul.S +++ b/source/backend/cpu/arm/arm64/MNNPackedMatMul.S @@ -13,11 +13,15 @@ .align 5 // 12 * 8 MatMul asm_function MNNPackedMatMul -//void MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter, float* cache, const float* postParameters, const float* bias); -// x0: C, x1:A, x2:B, x3:parameter, x4: cache, x5: postParameters, x6:bias -sub sp, sp, #128 -st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 -st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 +//void MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias); +// x0: C, x1:A, x2:B, x3:parameter, x4: postParameters, x5:bias +// sub sp, sp, #128 +// st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 +// st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 +stp d8, d9, [sp, #-16] +stp d10, d11, [sp, #-32] +stp d12, d13, [sp, #-48] +stp d14, d15, [sp, #-64] //ldr x8, [x3, #0] // deprecated ldr x9, [x3, #8] // l @@ -32,8 +36,8 @@ ldr x7, [x3, #40] // bExtraStride add x10, x10, #3 lsr x10, x10, #2 -cbz x5, Start -ld1 {v5.4s}, [x5] +cbz x4, Start +ld1 {v5.4s}, [x4] dup v6.4s, v5.s[2] // Min Value dup v7.4s, v5.s[3] // Max Value @@ -43,7 +47,7 @@ cmp x10, #2 blt LH4 LH8: -sub x14, x13, #128 +// sub x14, x13, #160 LoopH: mov x15, x1 subs x12, x9, #1 @@ -188,10 +192,10 @@ LoopH: sub x10, x10, #2 cmp x10, #2 - cbz x5, StoreLH8 + cbz x4, StoreLH8 AddBiasLH8: - ld1 {v0.4s, v1.4s}, [x6], #32 + ld1 {v0.4s, v1.4s}, [x5], #32 fmla v8.4s, v0.4s, v5.s[1] fmla v9.4s, v0.4s, v5.s[1] @@ -275,14 +279,28 @@ LoopH: fmin v31.4s, v31.4s, v7.4s StoreLH8: + stp q8, q9, [x0] + stp q10, q11, [x0, #(32 * 1)] // 2 * 4 * sizeof(int16_t) + stp q12, q13, [x0, #(32 * 2)] + stp q14, q15, [x0, #(32 * 3)] + stp q16, q17, [x0, #(32 * 4)] + stp q18, q19, [x0, #(32 * 5)] + add x0, x0, x13 // stp donot support post-index offset in register + stp q20, q21, [x0] + stp q22, q23, [x0, #(32 * 1)] + stp q24, q25, [x0, #(32 * 2)] + stp q26, q27, [x0, #(32 * 3)] + stp q28, q29, [x0, #(32 * 4)] + stp q30, q31, [x0, #(32 * 5)] + add x0, x0, x13 - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0], #64 - st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x0], #64 - st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], x14 - - st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 - st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], #64 - st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0], x14 + // st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0], #64 + // st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x0], #64 + // st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], x14 +// + // st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 + // st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], #64 + // st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0], x14 bge LoopH @@ -334,9 +352,9 @@ LoopHRemain: bne LoopLR LoopLREnd: - cbz x5, StoreLH4 + cbz x4, StoreLH4 AddBiasLH4: - ld1 {v0.4s}, [x6], #16 + ld1 {v0.4s}, [x5], #16 fmla v8.4s, v0.4s, v5.s[1] fmla v9.4s, v0.4s, v5.s[1] @@ -381,17 +399,28 @@ LoopHRemain: fmin v19.4s, v19.4s, v7.4s StoreLH4: + stp q8, q9, [x0] + stp q10, q11, [x0, #(32 * 1)] // 2 * 4 * sizeof(float) + stp q12, q13, [x0, #(32 * 2)] + stp q14, q15, [x0, #(32 * 3)] + 
stp q16, q17, [x0, #(32 * 4)] + stp q18, q19, [x0, #(32 * 5)] - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0], #64 - st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x0], #64 - st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] + // st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0], #64 + // st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x0], #64 + // st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] sub x10, x10, #1 End: -sub sp, sp, #128 -ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 -ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 + +// sub sp, sp, #128 +// ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 +// ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 +ldp d8, d9, [sp, #-16] +ldp d10, d11, [sp, #-32] +ldp d12, d13, [sp, #-48] +ldp d14, d15, [sp, #-64] ret diff --git a/source/backend/cpu/arm/arm64/MNNPackedMatMulRemain.S b/source/backend/cpu/arm/arm64/MNNPackedMatMulRemain.S index 35939dcb..e2a8ffd3 100644 --- a/source/backend/cpu/arm/arm64/MNNPackedMatMulRemain.S +++ b/source/backend/cpu/arm/arm64/MNNPackedMatMulRemain.S @@ -14,8 +14,8 @@ .align 5 // 12 * 8 MatMul asm_function MNNPackedMatMulRemain -//void MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, float* cache, const float* postParameters, const float* bias); -//Auto x0: C, x1:A, x2:B, x3:eSize, x4:parameter, x5: cache, x6:postParameters, x7:bias +//void MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias); +//Auto x0: C, x1:A, x2:B, x3:eSize, x4:parameter, x7: cache, x5:postParameters, x6:bias sub sp, sp, #32 str x19, [sp, #0] str x20, [sp, #8] @@ -25,14 +25,14 @@ ldr x11, [x4, #0] // aStride ldr x9, [x4, #8] // l ldr x10, [x4, #16] // h -ldr x5, [x4, #24] // cStride +ldr x7, [x4, #24] // cStride ldr x19, [x4, #40] // bExtraStride add x10, x10, #3 lsr x10, x10, #2 -cbz x6, Start -ld1 {v5.4s}, [x6] +cbz x5, Start +ld1 {v5.4s}, [x5] dup v6.4s, v5.s[2] // Min Value dup v7.4s, v5.s[3] // Max Value @@ -43,7 +43,7 @@ cmp x3, #8 blt E4 LoopE8: - mov x20, x7 + mov x20, x6 mov x8, x10 mov x21, x0 mov x13, x2 @@ -51,7 +51,7 @@ LoopE8: LH8: cmp x8, #2 blt LH4 - sub x14, x5, #64 + // sub x14, x7, #64 LoopH8x8: mov x15, x1 subs x12, x9, #1 @@ -110,7 +110,7 @@ LoopE8: sub x8, x8, #2 cmp x8, #2 - cbz x6, StoreLH8 + cbz x5, StoreLH8 AddBiasLH8: ld1 {v0.4s, v1.4s}, [x20], #32 @@ -170,11 +170,22 @@ LoopE8: fmin v31.4s, v31.4s, v7.4s StoreLH8: - st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 - st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], x14 + stp q16, q17, [x0] + stp q18, q19, [x0, #(32 * 1)] + stp q24, q25, [x0, #(32 * 2)] + stp q26, q27, [x0, #(32 * 3)] + add x0, x0, x7 // stp donot support post-index offset in register - st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 - st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0], x14 + stp q20, q21, [x0] + stp q22, q23, [x0, #(32 * 1)] + stp q28, q29, [x0, #(32 * 2)] + stp q30, q31, [x0, #(32 * 3)] + add x0, x0, x7 + + // st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 + // st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], x14 + // st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 + // st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0], x14 bge LoopH8x8 @@ -220,7 +231,7 @@ LoopE8: bne LoopLR LoopLREnd: - cbz x6, StoreLH8x4 + cbz x5, StoreLH8x4 AddBiasLH8x4: ld1 {v0.4s}, [x20] @@ -233,7 +244,7 @@ LoopE8: fmla v21.4s, v0.4s, v5.s[1] fmla v22.4s, v0.4s, v5.s[1] fmla v23.4s, v0.4s, v5.s[1] - + PostTreatLH8x4: fmax v16.4s, v16.4s, v6.4s fmax v17.4s, v17.4s, v6.4s @@ -255,8 +266,14 @@ LoopE8: StoreLH8x4: - st1 {v16.4s, v17.4s, 
v18.4s, v19.4s}, [x0], #64 - st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 + stp q16, q17, [x0] + stp q18, q19, [x0, #(32 * 1)] + stp q20, q21, [x0, #(32 * 2)] + stp q22, q23, [x0, #(32 * 3)] + add x0, x0, #(32 * 4) + + // st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 + // st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 E8End: @@ -268,7 +285,7 @@ LoopE8: E4: cmp x3, #4 -mov x20, x7 +mov x20, x6 blt E1 mov x8, x10 mov x21, x0 @@ -300,7 +317,7 @@ blt E1 ld1 {v0.4s}, [x15], x11 fmla v16.4s, v3.4s, v0.s[0] fmla v17.4s, v3.4s, v0.s[1] - + beq E4LoopLComputeEnd E4LoopL: @@ -333,7 +350,7 @@ blt E1 sub x8, x8, #2 cmp x8, #2 - cbz x6, StoreLH4x8 + cbz x5, StoreLH4x8 AddBiasLH4x8: ld1 {v0.4s, v1.4s}, [x20], #32 @@ -347,7 +364,7 @@ blt E1 fmla v21.4s, v1.4s, v5.s[1] fmla v22.4s, v1.4s, v5.s[1] fmla v23.4s, v1.4s, v5.s[1] - + PostTreatLH4x8: fmax v16.4s, v16.4s, v6.4s fmax v17.4s, v17.4s, v6.4s @@ -368,9 +385,15 @@ blt E1 fmin v23.4s, v23.4s, v7.4s StoreLH4x8: + stp q16, q17, [x0] + stp q18, q19, [x0, #32] + add x0, x0, x7 // stp donot support post-index offset in register + stp q20, q21, [x0] + stp q22, q23, [x0, #32] + add x0, x0, x7 - st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], x5 - st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], x5 + // st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], x7 + // st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], x7 bge E4LoopH8 @@ -401,7 +424,7 @@ blt E1 bne E4LoopLR E4LoopLREnd: - cbz x6, StoreLH4x4 + cbz x5, StoreLH4x4 AddBiasLH4x4: ld1 {v0.4s}, [x20] @@ -410,7 +433,7 @@ blt E1 fmla v18.4s, v0.4s, v5.s[1] fmla v19.4s, v0.4s, v5.s[1] - + PostTreatLH4x4: fmax v16.4s, v16.4s, v6.4s fmax v17.4s, v17.4s, v6.4s @@ -423,7 +446,9 @@ blt E1 fmin v19.4s, v19.4s, v7.4s StoreLH4x4: - st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] + stp q16, q17, [x0] + stp q18, q19, [x0, #32] + // st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] E4End: @@ -436,7 +461,7 @@ cmp x3, #0 beq End LoopE1: - mov x20, x7 + mov x20, x6 mov x8, x10 mov x21, x0 mov x13, x2 @@ -470,13 +495,13 @@ LoopE1: sub x8, x8, #2 cmp x8, #2 - cbz x6, StoreLH1x8 + cbz x5, StoreLH1x8 AddBiasLH1x8: ld1 {v0.4s, v1.4s}, [x20], #32 fmla v16.4s, v0.4s, v5.s[1] fmla v20.4s, v1.4s, v5.s[1] - + PostTreatLH1x8: fmax v16.4s, v16.4s, v6.4s fmax v20.4s, v20.4s, v6.4s @@ -485,8 +510,8 @@ LoopE1: StoreLH1x8: - st1 {v16.4s}, [x0], x5 - st1 {v20.4s}, [x0], x5 + st1 {v16.4s}, [x0], x7 + st1 {v20.4s}, [x0], x7 bge E1LoopH8 @@ -511,11 +536,11 @@ LoopE1: bne E1LoopLR E1LoopLREnd: - cbz x6, StoreLH1x4 + cbz x5, StoreLH1x4 AddBiasLH1x4: ld1 {v0.4s}, [x20] fmla v16.4s, v0.4s, v5.s[1] - + PostTreatLH1x4: fmax v16.4s, v16.4s, v6.4s fmin v16.4s, v16.4s, v7.4s diff --git a/source/backend/cpu/arm/arm64/MNNPackedMatMulRemain_BF16.S b/source/backend/cpu/arm/arm64/MNNPackedMatMulRemain_BF16.S new file mode 100644 index 00000000..07ec6aad --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNPackedMatMulRemain_BF16.S @@ -0,0 +1,674 @@ +// +// MNNPackedMatMulRemain_BF16.S +// MNN +// +// Created by MNN on 2021/02/21. 
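+// Note (descriptive comment): bf16 variant of MNNPackedMatMulRemain. Weights and inputs are
+// bf16 values stored as int16_t; each 4h vector is widened to fp32 with "shll #16" before the
+// fmla accumulation and narrowed back with "shrn #16" when the result is stored.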
+// Copyright © 2018-2021 Alibaba Group Holding Limited +// + +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +// 12 * 8 MatMul +asm_function NEON_MNNPackedMatMulRemain_BF16 +//void NEON_MNNPackedMatMulRemain_BF16(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias); +//Auto x0: C, x1:A, x2:B, x3:eSize, x4:parameter, x5:postParameters, x6:bias +sub sp, sp, #32 +str x19, [sp, #0] +str x20, [sp, #8] +str x21, [sp, #16] +add sp, sp, #32 +ldr x11, [x4, #0] // aStride +ldr x9, [x4, #8] // l +ldr x10, [x4, #16] // h + +ldr x7, [x4, #24] // cStride +ldr x19, [x4, #40] // bExtraStride + +add x10, x10, #3 +lsr x10, x10, #2 + +cbz x5, Start +ld1 {v5.4s}, [x5] +dup v6.4s, v5.s[2] // Min Value +dup v7.4s, v5.s[3] // Max Value + +Start: + +E8: +cmp x3, #8 +blt E4 + +LoopE8: // e, TILE_BLOCK size is 8 + mov x20, x6 // bias + mov x8, x10 // updiv(h, 4) + mov x21, x0 // dest, C + mov x13, x2 // weight, B + + LH8: + cmp x8, #2 // h/4 > 2 + blt LH4 + // sub x14, x7, #32 // in "StoreLH8", total 2 lines stride is x14, first line is 4 * 4 * size_t(int16_t) = 32byte + LoopH8x8: + mov x15, x1 // src, A + subs x12, x9, #1 // l + ld1 {v3.4h, v4.4h}, [x13], #16 // 2 * 4 * sizeof(int16_t) + ld1 {v0.4h, v1.4h}, [x15], x11 + shll v3.4s, v3.4h, #16 + shll v4.4s, v4.4h, #16 + shll v0.4s, v0.4h, #16 + shll v1.4s, v1.4h, #16 + + fmul v16.4s, v3.4s, v0.s[0] + fmul v17.4s, v3.4s, v0.s[1] + fmul v18.4s, v3.4s, v0.s[2] + fmul v19.4s, v3.4s, v0.s[3] + + fmul v20.4s, v4.4s, v0.s[0] + fmul v21.4s, v4.4s, v0.s[1] + fmul v22.4s, v4.4s, v0.s[2] + fmul v23.4s, v4.4s, v0.s[3] + + fmul v24.4s, v3.4s, v1.s[0] + fmul v25.4s, v3.4s, v1.s[1] + fmul v26.4s, v3.4s, v1.s[2] + fmul v27.4s, v3.4s, v1.s[3] + + fmul v28.4s, v4.4s, v1.s[0] + fmul v29.4s, v4.4s, v1.s[1] + fmul v30.4s, v4.4s, v1.s[2] + fmul v31.4s, v4.4s, v1.s[3] + beq LoopLEnd + + LoopL: + ld1 {v3.4h, v4.4h}, [x13], #16 // 2 * 4 * sizeof(int16_t) + ld1 {v0.4h, v1.4h}, [x15], x11 + shll v3.4s, v3.4h, #16 + shll v4.4s, v4.4h, #16 + shll v0.4s, v0.4h, #16 + shll v1.4s, v1.4h, #16 + + fmla v16.4s, v3.4s, v0.s[0] + fmla v17.4s, v3.4s, v0.s[1] + fmla v18.4s, v3.4s, v0.s[2] + fmla v19.4s, v3.4s, v0.s[3] + + fmla v20.4s, v4.4s, v0.s[0] + fmla v21.4s, v4.4s, v0.s[1] + fmla v22.4s, v4.4s, v0.s[2] + fmla v23.4s, v4.4s, v0.s[3] + + fmla v24.4s, v3.4s, v1.s[0] + fmla v25.4s, v3.4s, v1.s[1] + fmla v26.4s, v3.4s, v1.s[2] + fmla v27.4s, v3.4s, v1.s[3] + + fmla v28.4s, v4.4s, v1.s[0] + fmla v29.4s, v4.4s, v1.s[1] + fmla v30.4s, v4.4s, v1.s[2] + fmla v31.4s, v4.4s, v1.s[3] + + subs x12, x12, #1 + bne LoopL + + LoopLEnd: + + add x13, x13, x19 + sub x8, x8, #2 + cmp x8, #2 + + cbz x5, StoreLH8 + AddBiasLH8: + ld1 {v0.4h, v1.4h}, [x20], #16 + shll v0.4s, v0.4h, #16 + shll v1.4s, v1.4h, #16 + + fmla v16.4s, v0.4s, v5.s[1] + fmla v17.4s, v0.4s, v5.s[1] + fmla v18.4s, v0.4s, v5.s[1] + fmla v19.4s, v0.4s, v5.s[1] + + fmla v20.4s, v1.4s, v5.s[1] + fmla v21.4s, v1.4s, v5.s[1] + fmla v22.4s, v1.4s, v5.s[1] + fmla v23.4s, v1.4s, v5.s[1] + + fmla v24.4s, v0.4s, v5.s[1] + fmla v25.4s, v0.4s, v5.s[1] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v0.4s, v5.s[1] + + fmla v28.4s, v1.4s, v5.s[1] + fmla v29.4s, v1.4s, v5.s[1] + fmla v30.4s, v1.4s, v5.s[1] + fmla v31.4s, v1.4s, v5.s[1] + + PostTreatLH8: + fmax v16.4s, v16.4s, v6.4s + fmax v17.4s, v17.4s, v6.4s + fmax v18.4s, v18.4s, v6.4s + fmax v19.4s, v19.4s, v6.4s + fmax v20.4s, v20.4s, v6.4s + fmax v21.4s, v21.4s, v6.4s + fmax v22.4s, v22.4s, v6.4s + fmax v23.4s, v23.4s, 
v6.4s + fmax v24.4s, v24.4s, v6.4s + fmax v25.4s, v25.4s, v6.4s + fmax v26.4s, v26.4s, v6.4s + fmax v27.4s, v27.4s, v6.4s + fmax v28.4s, v28.4s, v6.4s + fmax v29.4s, v29.4s, v6.4s + fmax v30.4s, v30.4s, v6.4s + fmax v31.4s, v31.4s, v6.4s + + fmin v16.4s, v16.4s, v7.4s + fmin v17.4s, v17.4s, v7.4s + fmin v18.4s, v18.4s, v7.4s + fmin v19.4s, v19.4s, v7.4s + fmin v20.4s, v20.4s, v7.4s + fmin v21.4s, v21.4s, v7.4s + fmin v22.4s, v22.4s, v7.4s + fmin v23.4s, v23.4s, v7.4s + fmin v24.4s, v24.4s, v7.4s + fmin v25.4s, v25.4s, v7.4s + fmin v26.4s, v26.4s, v7.4s + fmin v27.4s, v27.4s, v7.4s + fmin v28.4s, v28.4s, v7.4s + fmin v29.4s, v29.4s, v7.4s + fmin v30.4s, v30.4s, v7.4s + fmin v31.4s, v31.4s, v7.4s + + StoreLH8: + shrn v16.4h, v16.4s, #16 + shrn v17.4h, v17.4s, #16 + shrn v18.4h, v18.4s, #16 + shrn v19.4h, v19.4s, #16 + shrn v20.4h, v20.4s, #16 + shrn v21.4h, v21.4s, #16 + shrn v22.4h, v22.4s, #16 + shrn v23.4h, v23.4s, #16 + shrn v24.4h, v24.4s, #16 + shrn v25.4h, v25.4s, #16 + shrn v26.4h, v26.4s, #16 + shrn v27.4h, v27.4s, #16 + shrn v28.4h, v28.4s, #16 + shrn v29.4h, v29.4s, #16 + shrn v30.4h, v30.4s, #16 + shrn v31.4h, v31.4s, #16 + + stp d16, d17, [x0] + stp d18, d19, [x0, #(16 * 1)] + stp d24, d25, [x0, #(16 * 2)] + stp d26, d27, [x0, #(16 * 3)] + add x0, x0, x7 // stp donot support post-index offset in register + + stp d20, d21, [x0] + stp d22, d23, [x0, #(16 * 1)] + stp d28, d29, [x0, #(16 * 2)] + stp d30, d31, [x0, #(16 * 3)] + add x0, x0, x7 // stp donot support post-index offset in register + + // st1 {v16.4h, v17.4h, v18.4h, v19.4h}, [x0], #32 // 4 * 4 * sizeof(int16_t) + // st1 {v24.4h, v25.4h, v26.4h, v27.4h}, [x0], x14 + // st1 {v20.4h, v21.4h, v22.4h, v23.4h}, [x0], #32 + // st1 {v28.4h, v29.4h, v30.4h, v31.4h}, [x0], x14 + + bge LoopH8x8 + + LH4: + cbz x8, E8End + LoopHRemain: + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.4h}, [x13] + ld1 {v0.4h}, [x15], #8 + shll v3.4s, v3.4h, #16 + shll v0.4s, v0.4h, #16 + + fmul v16.4s, v3.4s, v0.s[0] + fmul v17.4s, v3.4s, v0.s[1] + add x13, x13, #16 // weight + ld1 {v1.4h}, [x15] + shll v1.4s, v1.4h, #16 + + fmul v18.4s, v3.4s, v0.s[2] + sub x15, x15, #8 + fmul v19.4s, v3.4s, v0.s[3] + add x15, x15, x11 + fmul v20.4s, v3.4s, v1.s[0] + fmul v21.4s, v3.4s, v1.s[1] + fmul v22.4s, v3.4s, v1.s[2] + fmul v23.4s, v3.4s, v1.s[3] + beq LoopLREnd + + LoopLR: + ld1 {v3.4h}, [x13] + ld1 {v0.4h}, [x15], #8 + shll v3.4s, v3.4h, #16 + shll v0.4s, v0.4h, #16 + + fmla v16.4s, v3.4s, v0.s[0] + fmla v17.4s, v3.4s, v0.s[1] + add x13, x13, #16 // weight + ld1 {v1.4h}, [x15] + shll v1.4s, v1.4h, #16 + + fmla v18.4s, v3.4s, v0.s[2] + sub x15, x15, #8 + fmla v19.4s, v3.4s, v0.s[3] + add x15, x15, x11 + + fmla v20.4s, v3.4s, v1.s[0] + fmla v21.4s, v3.4s, v1.s[1] + fmla v22.4s, v3.4s, v1.s[2] + fmla v23.4s, v3.4s, v1.s[3] + + subs x12, x12, #1 + bne LoopLR + LoopLREnd: + + cbz x5, StoreLH8x4 + AddBiasLH8x4: + ld1 {v0.4h}, [x20] + shll v0.4s, v0.4h, #16 + + fmla v16.4s, v0.4s, v5.s[1] + fmla v17.4s, v0.4s, v5.s[1] + fmla v18.4s, v0.4s, v5.s[1] + fmla v19.4s, v0.4s, v5.s[1] + + fmla v20.4s, v0.4s, v5.s[1] + fmla v21.4s, v0.4s, v5.s[1] + fmla v22.4s, v0.4s, v5.s[1] + fmla v23.4s, v0.4s, v5.s[1] + + PostTreatLH8x4: + fmax v16.4s, v16.4s, v6.4s + fmax v17.4s, v17.4s, v6.4s + fmax v18.4s, v18.4s, v6.4s + fmax v19.4s, v19.4s, v6.4s + fmax v20.4s, v20.4s, v6.4s + fmax v21.4s, v21.4s, v6.4s + fmax v22.4s, v22.4s, v6.4s + fmax v23.4s, v23.4s, v6.4s + + fmin v16.4s, v16.4s, v7.4s + fmin v17.4s, v17.4s, v7.4s + fmin v18.4s, v18.4s, v7.4s + fmin v19.4s, v19.4s, v7.4s + fmin 
v20.4s, v20.4s, v7.4s + fmin v21.4s, v21.4s, v7.4s + fmin v22.4s, v22.4s, v7.4s + fmin v23.4s, v23.4s, v7.4s + + StoreLH8x4: + shrn v16.4h, v16.4s, #16 + shrn v17.4h, v17.4s, #16 + shrn v18.4h, v18.4s, #16 + shrn v19.4h, v19.4s, #16 + shrn v20.4h, v20.4s, #16 + shrn v21.4h, v21.4s, #16 + shrn v22.4h, v22.4s, #16 + shrn v23.4h, v23.4s, #16 + + stp q16, q17, [x0] + stp q18, q19, [x0, #(16 * 1)] + stp q20, q21, [x0, #(16 * 2)] + stp q22, q23, [x0, #(16 * 3)] + add x0, x0, #(16 * 4) + + // st1 {v16.4h, v17.4h, v18.4h, v19.4h}, [x0], #32 + // st1 {v20.4h, v21.4h, v22.4h, v23.4h}, [x0], #32 + + E8End: + + sub x3, x3, #8 + cmp x3, #8 + add x0, x21, #64 // move dest address of 8 * 4 * sizeof(int16_t) + add x1, x1, #16 // move A matrix address of 8 * sizeof(int16_t) + bge LoopE8 + +E4: +cmp x3, #4 +mov x20, x6 +blt E1 + mov x8, x10 + mov x21, x0 + mov x13, x2 + + cmp x8, #2 + blt E4LH4 + + E4LH8: + E4LoopH8: + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.4h, v4.4h}, [x13], #16 + ld1 {v0.4h}, [x15], x11 + shll v3.4s, v3.4h, #16 + shll v4.4s, v4.4h, #16 + shll v0.4s, v0.4h, #16 + + fmul v16.4s, v3.4s, v0.s[0] + fmul v17.4s, v3.4s, v0.s[1] + fmul v18.4s, v3.4s, v0.s[2] + fmul v19.4s, v3.4s, v0.s[3] + + fmul v20.4s, v4.4s, v0.s[0] + fmul v21.4s, v4.4s, v0.s[1] + fmul v22.4s, v4.4s, v0.s[2] + fmul v23.4s, v4.4s, v0.s[3] + + beq E4LoopLEnd + + subs x12, x12, #1 + ld1 {v3.4h, v4.4h}, [x13], #16 + ld1 {v0.4h}, [x15], x11 + shll v3.4s, v3.4h, #16 + shll v4.4s, v4.4h, #16 + shll v0.4s, v0.4h, #16 + + fmla v16.4s, v3.4s, v0.s[0] + fmla v17.4s, v3.4s, v0.s[1] + + beq E4LoopLComputeEnd + + E4LoopL: + fmla v18.4s, v3.4s, v0.s[2] + fmla v19.4s, v3.4s, v0.s[3] + + fmla v20.4s, v4.4s, v0.s[0] + fmla v21.4s, v4.4s, v0.s[1] + fmla v22.4s, v4.4s, v0.s[2] + fmla v23.4s, v4.4s, v0.s[3] + + ld1 {v3.4h, v4.4h}, [x13], #16 + ld1 {v0.4h}, [x15], x11 + shll v3.4s, v3.4h, #16 + shll v4.4s, v4.4h, #16 + shll v0.4s, v0.4h, #16 + + fmla v16.4s, v3.4s, v0.s[0] + fmla v17.4s, v3.4s, v0.s[1] + + subs x12, x12, #1 + bne E4LoopL + E4LoopLComputeEnd: + fmla v18.4s, v3.4s, v0.s[2] + fmla v19.4s, v3.4s, v0.s[3] + + fmla v20.4s, v4.4s, v0.s[0] + fmla v21.4s, v4.4s, v0.s[1] + fmla v22.4s, v4.4s, v0.s[2] + fmla v23.4s, v4.4s, v0.s[3] + + E4LoopLEnd: + add x13, x13, x19 + sub x8, x8, #2 + cmp x8, #2 + + cbz x5, StoreLH4x8 + + AddBiasLH4x8: + ld1 {v0.4h, v1.4h}, [x20], #16 + shll v0.4s, v0.4h, #16 + shll v1.4s, v1.4h, #16 + + fmla v16.4s, v0.4s, v5.s[1] + fmla v17.4s, v0.4s, v5.s[1] + fmla v18.4s, v0.4s, v5.s[1] + fmla v19.4s, v0.4s, v5.s[1] + + fmla v20.4s, v1.4s, v5.s[1] + fmla v21.4s, v1.4s, v5.s[1] + fmla v22.4s, v1.4s, v5.s[1] + fmla v23.4s, v1.4s, v5.s[1] + + PostTreatLH4x8: + fmax v16.4s, v16.4s, v6.4s + fmax v17.4s, v17.4s, v6.4s + fmax v18.4s, v18.4s, v6.4s + fmax v19.4s, v19.4s, v6.4s + fmax v20.4s, v20.4s, v6.4s + fmax v21.4s, v21.4s, v6.4s + fmax v22.4s, v22.4s, v6.4s + fmax v23.4s, v23.4s, v6.4s + + fmin v16.4s, v16.4s, v7.4s + fmin v17.4s, v17.4s, v7.4s + fmin v18.4s, v18.4s, v7.4s + fmin v19.4s, v19.4s, v7.4s + fmin v20.4s, v20.4s, v7.4s + fmin v21.4s, v21.4s, v7.4s + fmin v22.4s, v22.4s, v7.4s + fmin v23.4s, v23.4s, v7.4s + + StoreLH4x8: + shrn v16.4h, v16.4s, #16 + shrn v17.4h, v17.4s, #16 + shrn v18.4h, v18.4s, #16 + shrn v19.4h, v19.4s, #16 + shrn v20.4h, v20.4s, #16 + shrn v21.4h, v21.4s, #16 + shrn v22.4h, v22.4s, #16 + shrn v23.4h, v23.4s, #16 + + + stp d16, d17, [x0] + stp d18, d19, [x0, #16] + add x0, x0, x7 + stp d20, d21, [x0] + stp d22, d23, [x0, #16] + add x0, x0, x7 + + // st1 {v16.4h, v17.4h, v18.4h, v19.4h}, [x0], 
x7 + // st1 {v20.4h, v21.4h, v22.4h, v23.4h}, [x0], x7 + + bge E4LoopH8 + + E4LH4: + cbz x8, E4End + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.4h}, [x13] + ld1 {v0.4h}, [x15], x11 + shll v3.4s, v3.4h, #16 + shll v0.4s, v0.4h, #16 + + fmul v16.4s, v3.4s, v0.s[0] + fmul v17.4s, v3.4s, v0.s[1] + fmul v18.4s, v3.4s, v0.s[2] + fmul v19.4s, v3.4s, v0.s[3] + add x13, x13, #16 // weight + + beq E4LoopLREnd + + E4LoopLR: + ld1 {v3.4h}, [x13] + ld1 {v0.4h}, [x15], x11 + shll v3.4s, v3.4h, #16 + shll v0.4s, v0.4h, #16 + + fmla v16.4s, v3.4s, v0.s[0] + fmla v17.4s, v3.4s, v0.s[1] + fmla v18.4s, v3.4s, v0.s[2] + fmla v19.4s, v3.4s, v0.s[3] + add x13, x13, #16 // weight + + subs x12, x12, #1 + bne E4LoopLR + E4LoopLREnd: + + cbz x5, StoreLH4x4 + AddBiasLH4x4: + ld1 {v0.4h}, [x20] + shll v0.4s, v0.4h, #16 + + fmla v16.4s, v0.4s, v5.s[1] + fmla v17.4s, v0.4s, v5.s[1] + fmla v18.4s, v0.4s, v5.s[1] + fmla v19.4s, v0.4s, v5.s[1] + + + PostTreatLH4x4: + fmax v16.4s, v16.4s, v6.4s + fmax v17.4s, v17.4s, v6.4s + fmax v18.4s, v18.4s, v6.4s + fmax v19.4s, v19.4s, v6.4s + + fmin v16.4s, v16.4s, v7.4s + fmin v17.4s, v17.4s, v7.4s + fmin v18.4s, v18.4s, v7.4s + fmin v19.4s, v19.4s, v7.4s + + StoreLH4x4: + + shrn v16.4h, v16.4s, #16 + shrn v17.4h, v17.4s, #16 + shrn v18.4h, v18.4s, #16 + shrn v19.4h, v19.4s, #16 + + stp d16, d17, [x0] + stp d18, d19, [x0, #16] + + // st1 {v16.4h, v17.4h, v18.4h, v19.4h}, [x0] + + E4End: + + sub x3, x3, #4 + add x0, x21, #32 // move dest address of 4 * 4 * sizeof(int16_t) + add x1, x1, #8 // move dest address of 4 * sizeof(int16_t) + +E1: +cmp x3, #0 +beq End + +LoopE1: + mov x20, x6 + mov x8, x10 + mov x21, x0 + mov x13, x2 + + cmp x8, #2 + blt E1LH4 + + E1LH8: + E1LoopH8: + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.4h, v4.4h}, [x13], #16 // + ld1 {v0.h}[0], [x15], x11 + shll v3.4s, v3.4h, #16 + shll v4.4s, v4.4h, #16 + shll v0.4s, v0.4h, #16 + + fmul v16.4s, v3.4s, v0.s[0] + fmul v20.4s, v4.4s, v0.s[0] + + beq E1LoopLEnd + + E1LoopL: + ld1 {v3.4h, v4.4h}, [x13], #16 // + ld1 {v0.h}[0], [x15], x11 + shll v3.4s, v3.4h, #16 + shll v4.4s, v4.4h, #16 + shll v0.4s, v0.4h, #16 + + fmla v16.4s, v3.4s, v0.s[0] + fmla v20.4s, v4.4s, v0.s[0] + + subs x12, x12, #1 + bne E1LoopL + + E1LoopLEnd: + + add x13, x13, x19 + sub x8, x8, #2 + cmp x8, #2 + + cbz x5, StoreLH1x8 + AddBiasLH1x8: + ld1 {v0.4h, v1.4h}, [x20], #16 + shll v1.4s, v1.4h, #16 + shll v0.4s, v0.4h, #16 + + fmla v16.4s, v0.4s, v5.s[1] + fmla v20.4s, v1.4s, v5.s[1] + + PostTreatLH1x8: + fmax v16.4s, v16.4s, v6.4s + fmax v20.4s, v20.4s, v6.4s + fmin v16.4s, v16.4s, v7.4s + fmin v20.4s, v20.4s, v7.4s + + StoreLH1x8: + shrn v16.4h, v16.4s, #16 + shrn v20.4h, v20.4s, #16 + st1 {v16.4h}, [x0], x7 + st1 {v20.4h}, [x0], x7 + + bge E1LoopH8 + + E1LH4: + cbz x8, E1End + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.4h}, [x13] + ld1 {v0.h}[0], [x15], x11 + shll v3.4s, v3.4h, #16 + shll v0.4s, v0.4h, #16 + + fmul v16.4s, v3.4s, v0.s[0] + add x13, x13, #16 // weight + + beq E1LoopLREnd + + E1LoopLR: + ld1 {v3.4h}, [x13] + ld1 {v0.h}[0], [x15], x11 + shll v3.4s, v3.4h, #16 + shll v0.4s, v0.4h, #16 + + fmla v16.4s, v3.4s, v0.s[0] + add x13, x13, #16 // weight + + subs x12, x12, #1 + bne E1LoopLR + E1LoopLREnd: + + cbz x5, StoreLH1x4 + AddBiasLH1x4: + ld1 {v0.4h}, [x20] + shll v0.4s, v0.4h, #16 + + fmla v16.4s, v0.4s, v5.s[1] + + PostTreatLH1x4: + fmax v16.4s, v16.4s, v6.4s + fmin v16.4s, v16.4s, v7.4s + + StoreLH1x4: + shrn v16.4h, v16.4s, #16 + st1 {v16.4h}, [x0] + + E1End: + + subs x3, x3, #1 + add x0, x21, #8 + add x1, x1, #2 + bne LoopE1 + + +End: 
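+// Epilogue: reload the callee-saved x19/x20/x21 spilled in the prologue (AAPCS64 keeps
+// x19-x28 callee-saved) and return.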
+sub sp, sp, #32 +ldr x19, [sp, #0] +ldr x20, [sp, #8] +ldr x21, [sp, #16] +add sp, sp, #32 + +ret + +#endif diff --git a/source/backend/cpu/arm/arm64/MNNPackedMatMul_BF16.S b/source/backend/cpu/arm/arm64/MNNPackedMatMul_BF16.S new file mode 100644 index 00000000..85a731f7 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNPackedMatMul_BF16.S @@ -0,0 +1,507 @@ +// +// MNNPackedMatMul_BF16.S +// MNN +// +// Created by MNN on 2021/02/21. +// Copyright © 2018-2021 Alibaba Group Holding Limited +// +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + + +.text +.align 5 +// 12 * 8 MatMul +asm_function NEON_MNNPackedMatMul_BF16 +//void NEON_MNNPackedMatMul_BF16(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias); +// x0: C, x1:A, x2:B, x3:parameter, x4: postParameters, x5:bias +// sub sp, sp, #128 +// st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 +// st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 +stp d8, d9, [sp, #-16] +stp d10, d11, [sp, #-32] +stp d12, d13, [sp, #-48] +stp d14, d15, [sp, #-64] + +//ldr x8, [x3, #0] // deprecated +ldr x9, [x3, #8] // l +ldr x10, [x3, #16] // h + +ldr x13, [x3, #24] // cStride +ldr x7, [x3, #40] // bExtraStride + +// v0, v1, v2: A +// v3, v4: B +// v8 - v31: C +add x10, x10, #3 +lsr x10, x10, #2 + +cbz x4, Start +ld1 {v5.4s}, [x4] +dup v6.4s, v5.s[2] // Min Value +dup v7.4s, v5.s[3] // Max Value + +Start: + +cmp x10, #2 +blt LH4 + +LH8: +// sub x14, x13, #80 // in "StoreLH8", total 3 lines Cstride is x13, first 5 line stp is 5 * 8 * sizeof(int16_t) = 64byte + // stp should add at last +LoopH: + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.4h, v4.4h}, [x2], #16 // 8 * sizeof(int16_t) + ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 // 12 * sizeof(int16_t) + + shll v3.4s, v3.4h, #16 + shll v4.4s, v4.4h, #16 + shll v0.4s, v0.4h, #16 + shll v1.4s, v1.4h, #16 + shll v2.4s, v2.4h, #16 + + fmul v8.4s, v3.4s, v0.s[0] + fmul v9.4s, v3.4s, v0.s[1] + fmul v10.4s, v3.4s, v0.s[2] + fmul v11.4s, v3.4s, v0.s[3] + fmul v12.4s, v3.4s, v1.s[0] + fmul v13.4s, v3.4s, v1.s[1] + fmul v14.4s, v3.4s, v1.s[2] + fmul v15.4s, v3.4s, v1.s[3] + fmul v16.4s, v3.4s, v2.s[0] + fmul v17.4s, v3.4s, v2.s[1] + fmul v18.4s, v3.4s, v2.s[2] + fmul v19.4s, v3.4s, v2.s[3] + + fmul v20.4s, v4.4s, v0.s[0] + fmul v21.4s, v4.4s, v0.s[1] + fmul v22.4s, v4.4s, v0.s[2] + fmul v23.4s, v4.4s, v0.s[3] + + fmul v24.4s, v4.4s, v1.s[0] + fmul v25.4s, v4.4s, v1.s[1] + fmul v26.4s, v4.4s, v1.s[2] + fmul v27.4s, v4.4s, v1.s[3] + + fmul v28.4s, v4.4s, v2.s[0] + fmul v29.4s, v4.4s, v2.s[1] + fmul v30.4s, v4.4s, v2.s[2] + fmul v31.4s, v4.4s, v2.s[3] + + beq LoopLEnd + + cmp x12, #2 + blt L1 + LoopL2: + ld1 {v3.4h, v4.4h}, [x2], #16 // 8 * sizeof(int16_t) + ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 // 12 * sizeof(int16_t) // * sizeof(int16_t) + + shll v3.4s, v3.4h, #16 + shll v4.4s, v4.4h, #16 + shll v0.4s, v0.4h, #16 + shll v1.4s, v1.4h, #16 + shll v2.4s, v2.4h, #16 + + fmla v8.4s, v3.4s, v0.s[0] + fmla v9.4s, v3.4s, v0.s[1] + fmla v10.4s, v3.4s, v0.s[2] + fmla v11.4s, v3.4s, v0.s[3] + fmla v12.4s, v3.4s, v1.s[0] + fmla v13.4s, v3.4s, v1.s[1] + fmla v14.4s, v3.4s, v1.s[2] + fmla v15.4s, v3.4s, v1.s[3] + fmla v16.4s, v3.4s, v2.s[0] + fmla v17.4s, v3.4s, v2.s[1] + fmla v18.4s, v3.4s, v2.s[2] + fmla v19.4s, v3.4s, v2.s[3] + + fmla v20.4s, v4.4s, v0.s[0] + fmla v21.4s, v4.4s, v0.s[1] + fmla v22.4s, v4.4s, v0.s[2] + fmla v23.4s, v4.4s, v0.s[3] + + fmla v24.4s, v4.4s, v1.s[0] + fmla v25.4s, v4.4s, v1.s[1] + fmla v26.4s, v4.4s, v1.s[2] + fmla v27.4s, v4.4s, v1.s[3] + + fmla v28.4s, 
v4.4s, v2.s[0] + fmla v29.4s, v4.4s, v2.s[1] + fmla v30.4s, v4.4s, v2.s[2] + fmla v31.4s, v4.4s, v2.s[3] + + ld1 {v3.4h, v4.4h}, [x2], #16 // 8 * sizeof(int16_t) + ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 // 12 * sizeof(int16_t) // * sizeof(int16_t) + + shll v3.4s, v3.4h, #16 + shll v4.4s, v4.4h, #16 + shll v0.4s, v0.4h, #16 + shll v1.4s, v1.4h, #16 + shll v2.4s, v2.4h, #16 + + fmla v8.4s, v3.4s, v0.s[0] + fmla v9.4s, v3.4s, v0.s[1] + fmla v10.4s, v3.4s, v0.s[2] + fmla v11.4s, v3.4s, v0.s[3] + fmla v12.4s, v3.4s, v1.s[0] + fmla v13.4s, v3.4s, v1.s[1] + fmla v14.4s, v3.4s, v1.s[2] + fmla v15.4s, v3.4s, v1.s[3] + fmla v16.4s, v3.4s, v2.s[0] + fmla v17.4s, v3.4s, v2.s[1] + fmla v18.4s, v3.4s, v2.s[2] + fmla v19.4s, v3.4s, v2.s[3] + + fmla v20.4s, v4.4s, v0.s[0] + fmla v21.4s, v4.4s, v0.s[1] + fmla v22.4s, v4.4s, v0.s[2] + fmla v23.4s, v4.4s, v0.s[3] + + fmla v24.4s, v4.4s, v1.s[0] + fmla v25.4s, v4.4s, v1.s[1] + fmla v26.4s, v4.4s, v1.s[2] + fmla v27.4s, v4.4s, v1.s[3] + + fmla v28.4s, v4.4s, v2.s[0] + fmla v29.4s, v4.4s, v2.s[1] + fmla v30.4s, v4.4s, v2.s[2] + fmla v31.4s, v4.4s, v2.s[3] + sub x12, x12, #2 + cmp x12, #2 + bge LoopL2 + + cbz x12, LoopLEnd + + L1: + ld1 {v3.4h, v4.4h}, [x2], #16 // 8 * sizeof(int16_t) + ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 // 12 * sizeof(int16_t) // * sizeof(int16_t) + + shll v3.4s, v3.4h, #16 + shll v4.4s, v4.4h, #16 + shll v0.4s, v0.4h, #16 + shll v1.4s, v1.4h, #16 + shll v2.4s, v2.4h, #16 + + fmla v8.4s, v3.4s, v0.s[0] + fmla v9.4s, v3.4s, v0.s[1] + fmla v10.4s, v3.4s, v0.s[2] + fmla v11.4s, v3.4s, v0.s[3] + fmla v12.4s, v3.4s, v1.s[0] + fmla v13.4s, v3.4s, v1.s[1] + fmla v14.4s, v3.4s, v1.s[2] + fmla v15.4s, v3.4s, v1.s[3] + fmla v16.4s, v3.4s, v2.s[0] + fmla v17.4s, v3.4s, v2.s[1] + fmla v18.4s, v3.4s, v2.s[2] + fmla v19.4s, v3.4s, v2.s[3] + + fmla v20.4s, v4.4s, v0.s[0] + fmla v21.4s, v4.4s, v0.s[1] + fmla v22.4s, v4.4s, v0.s[2] + fmla v23.4s, v4.4s, v0.s[3] + + fmla v24.4s, v4.4s, v1.s[0] + fmla v25.4s, v4.4s, v1.s[1] + fmla v26.4s, v4.4s, v1.s[2] + fmla v27.4s, v4.4s, v1.s[3] + + fmla v28.4s, v4.4s, v2.s[0] + fmla v29.4s, v4.4s, v2.s[1] + fmla v30.4s, v4.4s, v2.s[2] + fmla v31.4s, v4.4s, v2.s[3] + + LoopLEnd: + + add x2, x2, x7 // weight stride + sub x10, x10, #2 + cmp x10, #2 + + cbz x4, StoreLH8 + + AddBiasLH8: + ld1 {v0.4h, v1.4h}, [x5], #16 // 8 * sizeof(int16_t) + shll v0.4s, v0.4h, #16 + shll v1.4s, v1.4h, #16 + + fmla v8.4s, v0.4s, v5.s[1] + fmla v9.4s, v0.4s, v5.s[1] + fmla v10.4s, v0.4s, v5.s[1] + fmla v11.4s, v0.4s, v5.s[1] + + fmla v12.4s, v0.4s, v5.s[1] + fmla v13.4s, v0.4s, v5.s[1] + fmla v14.4s, v0.4s, v5.s[1] + fmla v15.4s, v0.4s, v5.s[1] + + fmla v16.4s, v0.4s, v5.s[1] + fmla v17.4s, v0.4s, v5.s[1] + fmla v18.4s, v0.4s, v5.s[1] + fmla v19.4s, v0.4s, v5.s[1] + + fmla v20.4s, v1.4s, v5.s[1] + fmla v21.4s, v1.4s, v5.s[1] + fmla v22.4s, v1.4s, v5.s[1] + fmla v23.4s, v1.4s, v5.s[1] + + fmla v24.4s, v1.4s, v5.s[1] + fmla v25.4s, v1.4s, v5.s[1] + fmla v26.4s, v1.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + + fmla v28.4s, v1.4s, v5.s[1] + fmla v29.4s, v1.4s, v5.s[1] + fmla v30.4s, v1.4s, v5.s[1] + fmla v31.4s, v1.4s, v5.s[1] + + PostTreatLH8: + fmax v8.4s, v8.4s, v6.4s + fmax v9.4s, v9.4s, v6.4s + fmax v10.4s, v10.4s, v6.4s + fmax v11.4s, v11.4s, v6.4s + fmax v12.4s, v12.4s, v6.4s + fmax v13.4s, v13.4s, v6.4s + fmax v14.4s, v14.4s, v6.4s + fmax v15.4s, v15.4s, v6.4s + fmax v16.4s, v16.4s, v6.4s + fmax v17.4s, v17.4s, v6.4s + fmax v18.4s, v18.4s, v6.4s + fmax v19.4s, v19.4s, v6.4s + fmax v20.4s, v20.4s, v6.4s + fmax v21.4s, v21.4s, v6.4s + fmax 
v22.4s, v22.4s, v6.4s + fmax v23.4s, v23.4s, v6.4s + fmax v24.4s, v24.4s, v6.4s + fmax v25.4s, v25.4s, v6.4s + fmax v26.4s, v26.4s, v6.4s + fmax v27.4s, v27.4s, v6.4s + fmax v28.4s, v28.4s, v6.4s + fmax v29.4s, v29.4s, v6.4s + fmax v30.4s, v30.4s, v6.4s + fmax v31.4s, v31.4s, v6.4s + + fmin v8.4s, v8.4s, v7.4s + fmin v9.4s, v9.4s, v7.4s + fmin v10.4s, v10.4s, v7.4s + fmin v11.4s, v11.4s, v7.4s + fmin v12.4s, v12.4s, v7.4s + fmin v13.4s, v13.4s, v7.4s + fmin v14.4s, v14.4s, v7.4s + fmin v15.4s, v15.4s, v7.4s + fmin v16.4s, v16.4s, v7.4s + fmin v17.4s, v17.4s, v7.4s + fmin v18.4s, v18.4s, v7.4s + fmin v19.4s, v19.4s, v7.4s + fmin v20.4s, v20.4s, v7.4s + fmin v21.4s, v21.4s, v7.4s + fmin v22.4s, v22.4s, v7.4s + fmin v23.4s, v23.4s, v7.4s + fmin v24.4s, v24.4s, v7.4s + fmin v25.4s, v25.4s, v7.4s + fmin v26.4s, v26.4s, v7.4s + fmin v27.4s, v27.4s, v7.4s + fmin v28.4s, v28.4s, v7.4s + fmin v29.4s, v29.4s, v7.4s + fmin v30.4s, v30.4s, v7.4s + fmin v31.4s, v31.4s, v7.4s + + StoreLH8: + + shrn v8.4h, v8.4s, #16 + shrn v9.4h, v9.4s, #16 + shrn v10.4h, v10.4s, #16 + shrn v11.4h, v11.4s, #16 + shrn v12.4h, v12.4s, #16 + shrn v13.4h, v13.4s, #16 + shrn v14.4h, v14.4s, #16 + shrn v15.4h, v15.4s, #16 + shrn v16.4h, v16.4s, #16 + shrn v17.4h, v17.4s, #16 + shrn v18.4h, v18.4s, #16 + shrn v19.4h, v19.4s, #16 + shrn v20.4h, v20.4s, #16 + shrn v21.4h, v21.4s, #16 + shrn v22.4h, v22.4s, #16 + shrn v23.4h, v23.4s, #16 + shrn v24.4h, v24.4s, #16 + shrn v25.4h, v25.4s, #16 + shrn v26.4h, v26.4s, #16 + shrn v27.4h, v27.4s, #16 + shrn v28.4h, v28.4s, #16 + shrn v29.4h, v29.4s, #16 + shrn v30.4h, v30.4s, #16 + shrn v31.4h, v31.4s, #16 + + stp d8, d9, [x0] + stp d10, d11, [x0, #(16 * 1)] // 2 * 4 * sizeof(int16_t) + stp d12, d13, [x0, #(16 * 2)] + stp d14, d15, [x0, #(16 * 3)] + stp d16, d17, [x0, #(16 * 4)] + stp d18, d19, [x0, #(16 * 5)] + add x0, x0, x13 // stp donot support post-index offset in register + stp d20, d21, [x0] + stp d22, d23, [x0, #(16 * 1)] + stp d24, d25, [x0, #(16 * 2)] + stp d26, d27, [x0, #(16 * 3)] + stp d28, d29, [x0, #(16 * 4)] + stp d30, d31, [x0, #(16 * 5)] + add x0, x0, x13 + + // st1 {v8.4h, v9.4h, v10.4h, v11.4h}, [x0], #32 // 16 * sizeof(int16_t) + // st1 {v12.4h, v13.4h, v14.4h, v15.4h}, [x0], #32 // 16 * sizeof(int16_t) + // st1 {v16.4h, v17.4h, v18.4h, v19.4h}, [x0], x14 + // st1 {v20.4h, v21.4h, v22.4h, v23.4h}, [x0], #32 // 16 * sizeof(int16_t) + // st1 {v24.4h, v25.4h, v26.4h, v27.4h}, [x0], #32 // 16 * sizeof(int16_t) + // st1 {v28.4h, v29.4h, v30.4h, v31.4h}, [x0], x14 + + bge LoopH + +LH4: +cbz x10, End +LoopHRemain: + mov x15, x1 + subs x12, x9, #1 + ld1 {v3.4h}, [x2] + ld1 {v0.4h}, [x15], #8 + shll v3.4s, v3.4h, #16 + shll v0.4s, v0.4h, #16 + + fmul v8.4s, v3.4s, v0.s[0] + fmul v9.4s, v3.4s, v0.s[1] + add x2, x2, #16 // + ld1 {v1.4h}, [x15], #8 + shll v1.4s, v1.4h, #16 + + fmul v10.4s, v3.4s, v0.s[2] + fmul v11.4s, v3.4s, v0.s[3] + fmul v12.4s, v3.4s, v1.s[0] + + ld1 {v2.4h}, [x15], #8 + shll v2.4s, v2.4h, #16 + + fmul v13.4s, v3.4s, v1.s[1] + fmul v14.4s, v3.4s, v1.s[2] + fmul v15.4s, v3.4s, v1.s[3] + fmul v16.4s, v3.4s, v2.s[0] + fmul v17.4s, v3.4s, v2.s[1] + fmul v18.4s, v3.4s, v2.s[2] + fmul v19.4s, v3.4s, v2.s[3] + + beq LoopLREnd + + LoopLR: + ld1 {v3.4h}, [x2] + ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 // 12 * sizeof(int16_t) + shll v3.4s, v3.4h, #16 + shll v0.4s, v0.4h, #16 + shll v1.4s, v1.4h, #16 + shll v2.4s, v2.4h, #16 + + fmla v8.4s, v3.4s, v0.s[0] + fmla v9.4s, v3.4s, v0.s[1] + fmla v10.4s, v3.4s, v0.s[2] + fmla v11.4s, v3.4s, v0.s[3] + add x2, x2, #16 // + fmla 
v12.4s, v3.4s, v1.s[0] + fmla v13.4s, v3.4s, v1.s[1] + fmla v14.4s, v3.4s, v1.s[2] + fmla v15.4s, v3.4s, v1.s[3] + fmla v16.4s, v3.4s, v2.s[0] + fmla v17.4s, v3.4s, v2.s[1] + fmla v18.4s, v3.4s, v2.s[2] + fmla v19.4s, v3.4s, v2.s[3] + + subs x12, x12, #1 + bne LoopLR + LoopLREnd: + + cbz x4, StoreLH4 + AddBiasLH4: + ld1 {v0.4h}, [x5], #8 + shll v0.4s, v0.4h, #16 + + fmla v8.4s, v0.4s, v5.s[1] + fmla v9.4s, v0.4s, v5.s[1] + fmla v10.4s, v0.4s, v5.s[1] + fmla v11.4s, v0.4s, v5.s[1] + + fmla v12.4s, v0.4s, v5.s[1] + fmla v13.4s, v0.4s, v5.s[1] + fmla v14.4s, v0.4s, v5.s[1] + fmla v15.4s, v0.4s, v5.s[1] + + fmla v16.4s, v0.4s, v5.s[1] + fmla v17.4s, v0.4s, v5.s[1] + fmla v18.4s, v0.4s, v5.s[1] + fmla v19.4s, v0.4s, v5.s[1] + + PostTreatLH4: + fmax v8.4s, v8.4s, v6.4s + fmax v9.4s, v9.4s, v6.4s + fmax v10.4s, v10.4s, v6.4s + fmax v11.4s, v11.4s, v6.4s + fmax v12.4s, v12.4s, v6.4s + fmax v13.4s, v13.4s, v6.4s + fmax v14.4s, v14.4s, v6.4s + fmax v15.4s, v15.4s, v6.4s + fmax v16.4s, v16.4s, v6.4s + fmax v17.4s, v17.4s, v6.4s + fmax v18.4s, v18.4s, v6.4s + fmax v19.4s, v19.4s, v6.4s + + fmin v8.4s, v8.4s, v7.4s + fmin v9.4s, v9.4s, v7.4s + fmin v10.4s, v10.4s, v7.4s + fmin v11.4s, v11.4s, v7.4s + fmin v12.4s, v12.4s, v7.4s + fmin v13.4s, v13.4s, v7.4s + fmin v14.4s, v14.4s, v7.4s + fmin v15.4s, v15.4s, v7.4s + fmin v16.4s, v16.4s, v7.4s + fmin v17.4s, v17.4s, v7.4s + fmin v18.4s, v18.4s, v7.4s + fmin v19.4s, v19.4s, v7.4s + + StoreLH4: + + shrn v8.4h, v8.4s, #16 + shrn v9.4h, v9.4s, #16 + shrn v10.4h, v10.4s, #16 + shrn v11.4h, v11.4s, #16 + shrn v12.4h, v12.4s, #16 + shrn v13.4h, v13.4s, #16 + shrn v14.4h, v14.4s, #16 + shrn v15.4h, v15.4s, #16 + shrn v16.4h, v16.4s, #16 + shrn v17.4h, v17.4s, #16 + shrn v18.4h, v18.4s, #16 + shrn v19.4h, v19.4s, #16 + + stp d8, d9, [x0] + stp d10, d11, [x0, #(16 * 1)] + stp d12, d13, [x0, #(16 * 2)] + stp d14, d15, [x0, #(16 * 3)] + stp d16, d17, [x0, #(16 * 4)] + stp d18, d19, [x0, #(16 * 5)] + + // st1 {v8.4h, v9.4h, v10.4h, v11.4h}, [x0], #32 + // st1 {v12.4h, v13.4h, v14.4h, v15.4h}, [x0], #32 + // st1 {v16.4h, v17.4h, v18.4h, v19.4h}, [x0] + + sub x10, x10, #1 + + +End: +// sub sp, sp, #128 +// ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 +// ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 +ldp d8, d9, [sp, #-16] +ldp d10, d11, [sp, #-32] +ldp d12, d13, [sp, #-48] +ldp d14, d15, [sp, #-64] + + +ret + +#endif diff --git a/source/backend/cpu/arm/arm64/MNNTranspose32Bit4x4.S b/source/backend/cpu/arm/arm64/MNNTranspose32Bit4x4.S index 07456669..8de9fc5e 100644 --- a/source/backend/cpu/arm/arm64/MNNTranspose32Bit4x4.S +++ b/source/backend/cpu/arm/arm64/MNNTranspose32Bit4x4.S @@ -31,7 +31,13 @@ lsr x5, x5, #2 // x6, x7 -> srcStride * sizeof(float), dstStride * sizeof(float) lsl x6, x6, #2 lsl x7, x7, #2 + +// [x0, x1, x2, x3] => [x0, x6, x2, x3] .macro transpose_4x4 x0, x1, x2, x3, x5, x6 +// x0: [00,01,02,03] \ x5:[00,10,02,12] \ x0:[00,10,20,30] +// x1: [10,11,12,13] ===\ x1:[01,11,03,13] ===\ x6:[01,11,21,31] +// x2: [20,21,22,23] ===/ x6:[20,30,22,32] ===/ x2:[02,12,22,32] +// x3: [30,31,32,33] / x3:[21,31,23,33] / x3:[03,13,23,33] trn1 \x5\().4s, \x0\().4s, \x1\().4s trn2 \x1\().4s, \x0\().4s, \x1\().4s trn1 \x6\().4s, \x2\().4s, \x3\().4s diff --git a/source/backend/cpu/arm/arm64/MNNUnPackC4_BF16.S b/source/backend/cpu/arm/arm64/MNNUnPackC4_BF16.S new file mode 100644 index 00000000..72e0aa7d --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNUnPackC4_BF16.S @@ -0,0 +1,167 @@ +// +// NEON_MNNUnPackC4_BF16.S +// MNN +// +// Created by MNN on 2019/02/02. 
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __aarch64__ +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function NEON_MNNUnpackC4_BF16 +// treate float pointer as int16_t* +//void NEON_MNNUnpackC4_BF16(float* dst, const float* src, size_t area, size_t depth) +//Auto load: +//x0:dst, x1:src, x2:area, x3:depth +mul x4, x2, x3 +cmp x4, #0 +beq DownEnd + +//Swap x0, x1 +mov x4, x0 +mov x0, x1 +mov x1, x4 + +//x4: srcDepthOffset:area * sizeof(int16_t) +mov x4, #2 // sizeof(int16_t) +mul x4, x2, x4 + +DownL4: +cmp x3, #3 +ble DownL3 + +DownL4Loop: +add x5, x1, x4 +add x6, x4, x5 +add x7, x4, x6 +mov x8, x2 +cmp x8, #3 +ble DownL4AreaRemain +DownL4AreaLoop: +ld4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 // 16 * sizeof(int16_t) +st1 {v0.4h}, [x1], #8 // 4 * sizeof(int16_t) +st1 {v1.4h}, [x5], #8 // 4 * sizeof(int16_t) +st1 {v2.4h}, [x6], #8 // 4 * sizeof(int16_t) +st1 {v3.4h}, [x7], #8 // 4 * sizeof(int16_t) +sub x8, x8, #4 +cmp x8, #4 +bge DownL4AreaLoop + +DownL4AreaRemain: +cmp x8, #0 +beq DownL4AreaRemainEnd +DownL4AreaRemainLoop: +ld1 {v0.4h}, [x0], #8 +st1 {v0.h}[0], [x1], #2 +st1 {v0.h}[1], [x5], #2 +st1 {v0.h}[2], [x6], #2 +st1 {v0.h}[3], [x7], #2 + + +subs x8, x8, #1 +bne DownL4AreaRemainLoop +DownL4AreaRemainEnd: +sub x3, x3, #4 +mov x1, x7 +cmp x3, #4 +bge DownL4Loop + +DownL3: +cmp x3, #2 +ble DownL2 +add x5, x1, x4 +add x6, x4, x5 +mov x8, x2 +cmp x8, #3 +ble DownL3AreaRemain +DownL3AreaLoop: +ld4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 // 16 * sizeof(int16_t) +st1 {v0.4h}, [x1], #8 // 4 * sizeof(int16_t) +st1 {v1.4h}, [x5], #8 // 4 * sizeof(int16_t) +st1 {v2.4h}, [x6], #8 // 4 * sizeof(int16_t) +sub x8, x8, #4 +cmp x8, #4 +bge DownL3AreaLoop + +cmp x8, #0 +beq DownL3AreaRemainEnd +DownL3AreaRemain: +ld1 {v0.4h}, [x0], #8 // 4 * sizeof(int16_t) +st1 {v0.h}[0], [x1], #2 // sizeof(int16_t) +st1 {v0.h}[1], [x5], #2 // sizeof(int16_t) +st1 {v0.h}[2], [x6], #2 // sizeof(int16_t) + +subs x8, x8, #1 +bne DownL3AreaRemain + +DownL3AreaRemainEnd: +sub x3, x3, #3 + + +DownL2: +cmp x3, #1 +ble DownL1 +add x5, x1, x4 +mov x8, x2 +cmp x8, #3 +ble DownL2AreaRemain +DownL2AreaLoop: +ld4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 // 16 * sizeof(int16_t) +st1 {v0.4h}, [x1], #8 +st1 {v1.4h}, [x5], #8 + +sub x8, x8, #4 +cmp x8, #4 +bge DownL2AreaLoop + +cmp x8, #0 +beq DownL2AreaRemainEnd +DownL2AreaRemain: +ld1 {v0.4h}, [x0], #8 // 4 * sizeof(int16_t) +st1 {v0.h}[0], [x1], #2 +st1 {v0.h}[1], [x5], #2 + +subs x8, x8, #1 +bne DownL2AreaRemain + +DownL2AreaRemainEnd: +sub x3, x3, #2 + +DownL1: +cmp x3, #0 +beq DownEnd +mov x8, x2 +cmp x8, #3 +ble DownL1AreaRemain +DownL1AreaLoop: +ld4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 // 16 * sizeof(int16_t) +st1 {v0.4h}, [x1], #8 + +sub x8, x8, #4 +cmp x8, #4 +bge DownL1AreaLoop + +cmp x8, #0 +beq DownL1AreaRemainEnd +DownL1AreaRemain: +movi v0.4h, #0 +ld1 {v0.4h}, [x0], #8 // 4 * sizeof(int16_t) +st1 {v0.h}[0], [x1], #2 + + +subs x8, x8, #1 +bne DownL1AreaRemain + +DownL1AreaRemainEnd: + +DownEnd: + +ret + + +#endif + diff --git a/source/backend/cpu/bf16/BF16Backend.cpp b/source/backend/cpu/bf16/BF16Backend.cpp new file mode 100644 index 00000000..dcfd5df9 --- /dev/null +++ b/source/backend/cpu/bf16/BF16Backend.cpp @@ -0,0 +1,171 @@ +// +// BF16Backend.cpp +// MNN +// +// Created by MNN on 2020/01/26. 
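+// BF16Backend stores activations as bfloat16 (int16_t buffers) and creates executions from
+// the BF16Creator registry populated by registerBF16Ops(); ops flagged as lowp-compatible
+// reuse the ordinary CPUBackend execution instead.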
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#include + +#include "BF16Functions.hpp" +#include "BF16Backend.hpp" +#include "core/BufferAllocator.hpp" +#include "core/TensorUtils.hpp" +#include "backend/cpu/CPUTensorConvert.hpp" +#include "core/OpCommonUtils.hpp" +namespace MNN { + +void registerBF16Ops(); +static std::map* gInstance = nullptr; +// The Function Will be Called in init +extern void registerBF16Backend() { + gInstance = new std::map; + bool success = BF16Functions::init(); + if (success) { + registerBF16Ops(); + } +} +bool BF16Backend::addBF16Creator(OpType t, BF16Creator* ct) { + auto creatorContainer = gInstance; + if (creatorContainer->find(t) == creatorContainer->end()) { + creatorContainer->insert(std::make_pair(t, ct)); + } + return true; +} + +BF16Backend::BF16Backend(const CPURuntime* runtime) : CPUBackend(runtime, BackendConfig::Precision_Low, MNN_FORWARD_CPU_EXTENSION) { + mCoreFunctions = BF16Functions::get(); +} + +BF16Backend::~BF16Backend() { + // nothing to do +} + +Execution* BF16Backend::onCreate(const std::vector& inputs, const std::vector& outputs, + const MNN::Op* op) { + for (auto t : outputs) { + if (t->getType().code != halide_type_float) { + return nullptr; + } + } + auto quantInfo = OpCommonUtils::getQuantInfo(inputs); + if (quantInfo.first) { + return nullptr; + } + bool originCreate = OpCommonUtils::opCompabilityForLowp(op); + if (originCreate) { + return CPUBackend::onCreate(inputs, outputs, op); + } + auto creatorContainer = gInstance; + auto iter = creatorContainer->find(op->type()); + + if (iter == creatorContainer->end()) { + return nullptr; + } + auto exe = iter->second->onCreate(inputs, outputs, op, this); + if (exe == nullptr) { + return nullptr; + } + return exe; +} + +static int _getAliginSize(const halide_buffer_t& buffer, MNN_DATA_FORMAT format) { + // The default data type of input tensor for arm82 backend is FLOAT32. 
+ // However, BF16Backend default data type is FLOAT16, so check whether data type is FLOAT32, + // then divide size by 2 + int size = sizeof(int16_t); + const int dimensions = buffer.dimensions; + for (int i = 0; i < dimensions; i++) { + int currentDimSize = buffer.dim[i].extent; + if (format == MNN_DATA_FORMAT_NC4HW4 && 1 == i) { + currentDimSize = ALIGN_UP4(currentDimSize); + } + size *= currentDimSize; + } + return size; +} + +bool BF16Backend::onAcquireBuffer(const Tensor* nativeTensor, StorageType storageType) { + // arm82 backend tensor data type is fp16 default + auto tensor = const_cast(nativeTensor); + auto& buffer = tensor->buffer(); + if (buffer.type != halide_type_of()) { + return CPUBackend::onAcquireBuffer(nativeTensor, storageType); + } + auto res = allocBuffer(_getAliginSize(buffer, TensorUtils::getDescribe(nativeTensor)->dimensionFormat), (Tensor*)nativeTensor, storageType); + if (!res) { + return false; + } + // Set mask in device for easy to determine + buffer.device = 1; + return true; +} + +void BF16Backend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const { + auto& ib = srcTensor->buffer(); + auto& ob = dstTensor->buffer(); + if (ib.type.code != halide_type_float) { + CPUBackend::onCopyBuffer(srcTensor, dstTensor); + return; + } + auto source = TensorUtils::getDescribe(srcTensor)->dimensionFormat; + auto dest = TensorUtils::getDescribe(dstTensor)->dimensionFormat; + auto srcType = MNN_FORWARD_CPU; + if (ib.device != 0) { + srcType = MNN_FORWARD_CPU_EXTENSION; + } + auto dstType = MNN_FORWARD_CPU; + if (ob.device != 0) { + dstType = MNN_FORWARD_CPU_EXTENSION; + } + if (srcType == dstType) { + ErrorCode code = ErrorCode::NO_ERROR; + auto tup = CPUTensorConverter::splitDimensions(srcTensor->buffer(), source); + int area = std::get<1>(tup), batch = std::get<0>(tup), channel = std::get<2>(tup); + if (srcType == MNN_FORWARD_CPU) { + code = CPUTensorConverter::convert(srcTensor->host(), dstTensor->host(), source, dest, batch, area, channel, 4); + } else { + code = CPUTensorConverter::convert(srcTensor->host(), dstTensor->host(), source, dest, batch, area, channel, 2); + } + MNN_ASSERT(code == ErrorCode::NO_ERROR); + return; + } + // Use CPU Copy to turn save format + std::shared_ptr tempTensor; + if (source != dest) { + if (srcType == MNN_FORWARD_CPU) { + tempTensor.reset(Tensor::create(dstTensor->shape(), nullptr, TensorUtils::getDimType(dstTensor))); + MNNCPUCopyBuffer(srcTensor, tempTensor.get()); + srcTensor = tempTensor.get(); + source = dest; + } else { + tempTensor.reset(Tensor::create(srcTensor->shape(), nullptr, TensorUtils::getDimType(srcTensor)), [dstTensor](void* ptr) { + auto tempT = (Tensor*)ptr; + MNNCPUCopyBuffer(tempT, dstTensor); + delete tempT; + }); + dstTensor = tempTensor.get(); + dest = source; + } + } + //MNN_PRINT("%d, %d - %d, %d\n", source, srcType, dest, dstType); + // The format is the same, just convert fp32-fp16 + const int elemenSize = srcTensor->elementSize(); + // copy and quantize/dequantize data + if (srcType == MNN_FORWARD_CPU) { + const auto src = srcTensor->host(); + auto dst = dstTensor->host(); + BF16Functions::get()->MNNFp32ToLowp(src, dst, elemenSize); + return; + } + if (srcType == MNN_FORWARD_CPU_EXTENSION) { + const auto src = srcTensor->host(); + auto dst = dstTensor->host(); + BF16Functions::get()->MNNLowpToFp32(src, dst, elemenSize); + return; + } + return; +} + +} // namespace MNN diff --git a/source/backend/cpu/bf16/BF16Backend.hpp b/source/backend/cpu/bf16/BF16Backend.hpp new file mode 100644 index 
00000000..c3800e16 --- /dev/null +++ b/source/backend/cpu/bf16/BF16Backend.hpp @@ -0,0 +1,46 @@ +// +// BF16Backend.hpp +// MNN +// +// Created by MNN on 2020/01/26. +// Copyright © 2018, Alibaba Group Holding Limited +// +#ifndef BF16Backend_hpp +#define BF16Backend_hpp + +#include "backend/cpu/CPUBackend.hpp" +#include "core/Macro.h" +#include "core/TensorUtils.hpp" + +namespace MNN { +class BF16Backend : public CPUBackend { +public: + virtual ~BF16Backend(); + BF16Backend(const CPURuntime* runtime); + virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, + const MNN::Op* op) override; + virtual bool onAcquireBuffer(const Tensor* nativeTensor, StorageType storageType) override; + + virtual void onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const override; + + int numberThread() const { + return threadNumber(); + } +public: + class BF16Creator { + public: + virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, + const MNN::Op* op, Backend* backend) const = 0; + }; + + static bool addBF16Creator(OpType t, BF16Creator* ct); +}; + +#define REGISTER_BF16_OP_CREATOR(type, creator) \ + void ___##type##__##creator##__() { \ + BF16Backend::addBF16Creator(type, new creator); \ + } + +} // namespace MNN + +#endif /* BF16Backend_hpp */ diff --git a/source/backend/cpu/bf16/BF16Binary.cpp b/source/backend/cpu/bf16/BF16Binary.cpp new file mode 100644 index 00000000..f0e4acc7 --- /dev/null +++ b/source/backend/cpu/bf16/BF16Binary.cpp @@ -0,0 +1,339 @@ +// +// BF16Binary.cpp +// MNN +// +// Created by MNN on 2021/02/07. +// Copyright © 2021, Alibaba Group Holding Limited +// + +#include +#include "backend/cpu/BinaryUtils.hpp" +#include "core/Macro.h" +#include "core/Execution.hpp" +#include "VecHalf.hpp" +#include "math/Vec.hpp" +#include "BF16Backend.hpp" +#include "BF16Functions.hpp" +using Vec4Half = MNN::Math::VecHalf<4>; +using Vec4 = MNN::Math::Vec; +namespace MNN { + +class BF16BinaryFloat : public Execution { +public: + BF16BinaryFloat(Backend *b, int32_t type); + virtual ~BF16BinaryFloat() = default; + virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; + virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; + +protected: + int32_t mType; + int mNeedBroadcastIndex; // -1 do not need broadcast, 0 for input0, 1 for input1 + int mTotalSize = 0; +}; +template +void BF16BinaryWrap(int16_t *dst, const int16_t *src0, const int16_t *src1, const int elementSize, const int needBroadcastIndex) { + Func compute; + const int sizeDivUnit = elementSize / 4; + const int remainCount = elementSize - sizeDivUnit * 4; + + float A[4]; + float B[4]; + float C[4]; + if (-1 == needBroadcastIndex) { + if (sizeDivUnit > 0) { + for (int i = 0; i < sizeDivUnit; ++i) { + const auto src0Ptr = src0; + const auto src1Ptr = src1; + auto dstPtr = dst; + Vec4::save(A, Vec4(std::move(Vec4Half::load(src0Ptr).value))); + Vec4::save(B, Vec4(std::move(Vec4Half::load(src1Ptr).value))); + for (int v = 0; v < 4; ++ v) { + C[v] = compute(A[v], B[v]); + } + Vec4Half::save(dstPtr, Vec4Half(std::move(Vec4::load(C).value))); + src0 += 4; + src1 += 4; + dst += 4; + } + } + if (remainCount > 0) { + int16_t tempSrc0[4]; + int16_t tempSrc1[4]; + int16_t tempDst[4]; + ::memcpy(tempSrc0, src0, remainCount * sizeof(int16_t)); + ::memcpy(tempSrc1, src1, remainCount * sizeof(int16_t)); + Vec4::save(A, Vec4(std::move(Vec4Half::load(tempSrc0).value))); + Vec4::save(B, 
Vec4(std::move(Vec4Half::load(tempSrc1).value))); + for (int v = 0; v < remainCount; ++ v) { + C[v] = compute(A[v], B[v]); + } + Vec4Half::save(tempDst, Vec4Half(std::move(Vec4::load(C).value))); + ::memcpy(dst, tempDst, remainCount * sizeof(int16_t)); + } + } else if (0 == needBroadcastIndex) { + const int16_t srcValue016 = src0[0]; + float srcValue0; + BF16Functions::get()->MNNLowpToFp32(&srcValue016, &srcValue0, 1); + auto a = Vec4Half(srcValue0); + if (sizeDivUnit > 0) { + for (int i = 0; i < sizeDivUnit; ++i) { + const auto src1Ptr = src1; + auto dstPtr = dst; + Vec4::save(B, Vec4(std::move(Vec4Half::load(src1Ptr).value))); + for (int v = 0; v < 4; ++ v) { + C[v] = compute(A[v], B[v]); + } + Vec4Half::save(dstPtr, Vec4Half(std::move(Vec4::load(C).value))); + src1 += 4; + dst += 4; + } + } + if (remainCount > 0) { + int16_t tempSrc1[4]; + int16_t tempDst[4]; + ::memcpy(tempSrc1, src1, remainCount * sizeof(int16_t)); + Vec4::save(B, Vec4(std::move(Vec4Half::load(tempSrc1).value))); + for (int v = 0; v < remainCount; ++ v) { + C[v] = compute(srcValue0, B[v]); + } + Vec4Half::save(tempDst, Vec4Half(std::move(Vec4::load(C).value))); + ::memcpy(dst, tempDst, remainCount * sizeof(int16_t)); + } + } else { + const int16_t srcValue116 = src1[0]; + float srcValue1; + BF16Functions::get()->MNNLowpToFp32(&srcValue116, &srcValue1, 1); + auto b = Vec4Half(srcValue1); + if (sizeDivUnit > 0) { + for (int i = 0; i < sizeDivUnit; ++i) { + const auto src0Ptr = src0; + auto dstPtr = dst; + Vec4::save(A, Vec4(std::move(Vec4Half::load(src0Ptr).value))); + for (int v = 0; v < 4; ++ v) { + C[v] = compute(A[v], B[v]); + } + Vec4Half::save(dstPtr, Vec4Half(std::move(Vec4::load(C).value))); + src0 += 4; + dst += 4; + } + } + if (remainCount > 0) { + int16_t tempSrc0[4]; + int16_t tempDst[4]; + ::memcpy(tempSrc0, src0, remainCount * sizeof(int16_t)); + Vec4::save(A, Vec4(std::move(Vec4Half::load(tempSrc0).value))); + for (int v = 0; v < remainCount; ++ v) { + C[v] = compute(A[v], srcValue1); + } + Vec4Half::save(tempDst, Vec4Half(std::move(Vec4::load(C).value))); + ::memcpy(dst, tempDst, remainCount * sizeof(int16_t)); + } + } +} + + +template +void BF16Binary(int16_t *dst, const int16_t *src0, const int16_t *src1, const int elementSize, const int needBroadcastIndex) { + Func compute; + const int sizeDivUnit = elementSize / 4; + const int remainCount = elementSize - sizeDivUnit * 4; + + if (-1 == needBroadcastIndex) { + if (sizeDivUnit > 0) { + for (int i = 0; i < sizeDivUnit; ++i) { + Vec4Half a = Vec4Half::load(src0); + Vec4Half b = Vec4Half::load(src1); + Vec4Half::save(dst, compute(a, b)); + src0 += 4; + src1 += 4; + dst += 4; + } + } + if (remainCount > 0) { + int16_t tempSrc0[4]; + int16_t tempSrc1[4]; + int16_t tempDst[4]; + ::memcpy(tempSrc0, src0, remainCount * sizeof(int16_t)); + ::memcpy(tempSrc1, src1, remainCount * sizeof(int16_t)); + Vec4Half a = Vec4Half::load(tempSrc0); + Vec4Half b = Vec4Half::load(tempSrc1); + Vec4Half::save(tempDst, compute(a, b)); + ::memcpy(dst, tempDst, remainCount * sizeof(int16_t)); + } + } else if (0 == needBroadcastIndex) { + const int16_t srcValue016 = src0[0]; + float srcValue0; + BF16Functions::get()->MNNLowpToFp32(&srcValue016, &srcValue0, 1); + Vec4Half a = Vec4Half(srcValue0); + if (sizeDivUnit > 0) { + for (int i = 0; i < sizeDivUnit; ++i) { + const auto src1Ptr = src1; + auto dstPtr = dst; + Vec4Half b = Vec4Half::load(src1Ptr); + Vec4Half::save(dstPtr, compute(a, b)); + src1 += 4; + dst += 4; + } + } + if (remainCount > 0) { + int16_t tempSrc1[8]; + int16_t 
tempDst[8]; + ::memcpy(tempSrc1, src1, remainCount * sizeof(int16_t)); + Vec4Half b = Vec4Half::load(tempSrc1); + Vec4Half::save(tempDst, compute(a, b)); + ::memcpy(dst, tempDst, remainCount * sizeof(int16_t)); + } + } else { + const int16_t srcValue116 = src1[0]; + float srcValue1; + BF16Functions::get()->MNNLowpToFp32(&srcValue116, &srcValue1, 1); + Vec4Half b = Vec4Half(srcValue1); + if (sizeDivUnit > 0) { + for (int i = 0; i < sizeDivUnit; ++i) { + const auto src0Ptr = src0; + auto dstPtr = dst; + Vec4Half a = Vec4Half::load(src0Ptr); + Vec4Half::save(dstPtr, compute(a, b)); + src0 += 4; + dst += 4; + } + } + if (remainCount > 0) { + int16_t tempSrc0[8]; + int16_t tempDst[8]; + ::memcpy(tempSrc0, src0, remainCount * sizeof(int16_t)); + Vec4Half a = Vec4Half::load(tempSrc0); + Vec4Half::save(tempDst, compute(a, b)); + ::memcpy(dst, tempDst, remainCount * sizeof(int16_t)); + } + } +} + + +struct VecBinaryAdd : std::binary_function { + Vec4Half operator()(const Vec4Half& x, const Vec4Half& y) const { + return x + y; + } +}; + +struct VecBinarySub : std::binary_function { + Vec4Half operator()(const Vec4Half& x, const Vec4Half& y) const { + return x - y; + } +}; + +struct VecBinaryMul : std::binary_function { + Vec4Half operator()(const Vec4Half& x, const Vec4Half& y) const { + return x * y; + } +}; + +struct VecBinaryMin : std::binary_function { + Vec4Half operator()(const Vec4Half& x, const Vec4Half& y) const { + return Vec4Half::min(x, y); + } +}; + +struct VecBinaryMax : std::binary_function { + Vec4Half operator()(const Vec4Half& x, const Vec4Half& y) const { + return Vec4Half::max(x, y); + } +}; + +struct VecBinarySqd : std::binary_function { + Vec4Half operator()(const Vec4Half& x, const Vec4Half& y) const { + return (x-y)*(x-y); + } +}; + +BF16BinaryFloat::BF16BinaryFloat(Backend *backend, int32_t type):Execution(backend), mType(type) { + // Do nothing +} + +ErrorCode BF16BinaryFloat::onResize(const std::vector &inputs, const std::vector &outputs) { + MNN_ASSERT(1 == outputs.size()); + const int input0DataCount = inputs[0]->elementSize(); + const int input1DataCount = inputs[1]->elementSize(); + if (input1DataCount == input0DataCount) { + mNeedBroadcastIndex = -1; + mTotalSize = input1DataCount; + } else if (input0DataCount == 1) { + mNeedBroadcastIndex = 0; + mTotalSize = input1DataCount; + } else { + mNeedBroadcastIndex = 1; + mTotalSize = input0DataCount; + } + return NO_ERROR; +} + +ErrorCode BF16BinaryFloat::onExecute(const std::vector &inputs, const std::vector &outputs){ + auto input0 = inputs[0]; + auto input1 = inputs[1]; + auto output = outputs[0]; + + const auto src0 = input0->host(); + const auto src1 = input1->host(); + auto dst = output->host(); + + switch (mType) { + case BinaryOpOperation_ADD: + BF16Binary(dst, src0, src1, mTotalSize, mNeedBroadcastIndex); + break; + case BinaryOpOperation_SUB: + BF16Binary(dst, src0, src1, mTotalSize, mNeedBroadcastIndex); + break; + case BinaryOpOperation_MUL: + BF16Binary(dst, src0, src1, mTotalSize, mNeedBroadcastIndex); + break; + case BinaryOpOperation_MINIMUM: + BF16Binary(dst, src0, src1, mTotalSize, mNeedBroadcastIndex); + break; + case BinaryOpOperation_MAXIMUM: + BF16Binary(dst, src0, src1, mTotalSize, mNeedBroadcastIndex); + break; + case BinaryOpOperation_SquaredDifference: + BF16Binary(dst, src0, src1, mTotalSize, mNeedBroadcastIndex); + break; + case BinaryOpOperation_REALDIV: + BF16BinaryWrap>(dst, src0, src1, mTotalSize, mNeedBroadcastIndex); + break; + case BinaryOpOperation_FLOORDIV: + BF16BinaryWrap>(dst, src0, 
src1, mTotalSize, mNeedBroadcastIndex); + break; + case BinaryOpOperation_FLOORMOD: + BF16BinaryWrap>(dst, src0, src1, mTotalSize, mNeedBroadcastIndex); + break; + case BinaryOpOperation_POW: + BF16BinaryWrap>(dst, src0, src1, mTotalSize, mNeedBroadcastIndex); + break; + case BinaryOpOperation_ATAN2: + BF16BinaryWrap>(dst, src0, src1, mTotalSize, mNeedBroadcastIndex); + break; + case BinaryOpOperation_MOD: + BF16BinaryWrap>(dst, src0, src1, mTotalSize, mNeedBroadcastIndex); + break; + default: + return NOT_SUPPORT; + break; + } + return NO_ERROR; +} + +class BF16BinaryCreator : public BF16Backend::BF16Creator { + virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, + const MNN::Op *op, Backend *backend) const override { + int32_t type = op->main_as_BinaryOp()->opType(); + auto dataType = outputs[0]->getType(); + if (dataType.code != halide_type_float) { + return nullptr; + } + return new BF16BinaryFloat(backend, type); + } +}; + +REGISTER_BF16_OP_CREATOR(OpType_BinaryOp, BF16BinaryCreator); + + + +} // namespace MNN diff --git a/source/backend/cpu/bf16/BF16Functions.cpp b/source/backend/cpu/bf16/BF16Functions.cpp new file mode 100644 index 00000000..3b4ca62a --- /dev/null +++ b/source/backend/cpu/bf16/BF16Functions.cpp @@ -0,0 +1,588 @@ +#ifdef MNN_USE_SSE +#include "../x86_x64/sse/FunctionSummary.hpp" +#include "../x86_x64/avx/FunctionSummary.hpp" +#include "../x86_x64/avxfma/FunctionSummary.hpp" +#include "../x86_x64/avx512/FunctionSummary.hpp" +#include "../x86_x64/cpu_id.h" +#endif + +#if defined(MNN_USE_NEON) +#include "../arm/FunctionSummary.hpp" +#endif + +#include "BF16Functions.hpp" +#include "WinogradOptFunctionHalf.hpp" +#include "../compute/CommonOptFunction.h" +#include "VecHalf.hpp" +#include "math/Vec.hpp" +using BFVec4 = MNN::Math::VecHalf<4>; +using Vec4 = MNN::Math::Vec; +namespace MNN { +// just for reference BF16 converting of c++ code, not for arm or sse. 
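+// bf16 keeps the sign, the 8 exponent bits and the top 7 mantissa bits of an IEEE fp32 value,
+// so fp32 -> bf16 is a truncating right shift by 16 and bf16 -> fp32 is a left shift by 16.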
+inline int16_t MNNFP32ToBF16(float fp32Value) { + int32_t* s32Value = (int32_t*)(&fp32Value); + return (int16_t)((*s32Value) >> 16); +} +inline float MNNLowpToFp32(int16_t s16Value) { + int32_t s32Value = ((int32_t)s16Value) << 16; + float* fp32Value = (float*)(&s32Value); + return *fp32Value; +} + +static void _MNNFp32ToLowp(const float* src, int16_t* dst, size_t size) { + int sizeC4 = size / 4; + for (int i = 0; i < sizeC4; ++i) { + auto srcV = Vec4::load(src); + auto dstV = BFVec4(std::move(srcV.value)); + BFVec4::save(dst, dstV); + src+=4; + dst+=4; + } + int sizeRemain = size % 4; + if (sizeRemain > 0) { + float srcTemp[4]; + int64_t dstTemp[1]; + ::memcpy(srcTemp, src, sizeRemain * sizeof(float)); + auto srcV = Vec4::load(srcTemp); + auto dstV = BFVec4(std::move(srcV.value)); + BFVec4::save((int16_t*)dstTemp, dstV); + ::memcpy(dst, dstTemp, sizeRemain * sizeof(int16_t)); + } +} +static void _MNNLowpToFp32(const int16_t* src, float* dst, size_t size) { + int sizeC4 = size / 4; + for (int i = 0; i < sizeC4; ++i) { + auto srcV = BFVec4::load(src); + auto dstV = Vec4(std::move(srcV.value)); + Vec4::save(dst, dstV); + src+=4; + dst+=4; + } + int sizeRemain = size % 4; + if (sizeRemain > 0) { + int64_t srcTemp[2]; + float dstTemp[4]; + ::memcpy(srcTemp, src, sizeRemain * sizeof(int16_t)); + auto srcV = BFVec4::load((int16_t*)srcTemp); + auto dstV = Vec4(std::move(srcV.value)); + Vec4::save(dstTemp, dstV); + ::memcpy(dst, dstTemp, sizeRemain * sizeof(float)); + } +} +static void MNNConvRunForUnitDepthWiseBF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, + size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) { + int fx, fy; + BFVec4 dstValue(0.0f); + const int16_t* src_z = (const int16_t*)src; + const int16_t* weight_z = (const int16_t*)weight; + for (fy = 0; fy < fh; ++fy) { + const auto src_y = src_z + fy * dilateY_step; + const auto weight_y = weight_z + fy * weight_y_step; + for (fx = 0; fx < fw; ++fx) { + const auto weight_x = weight_y + 4 * fx; + const auto src_x = src_y + fx * dilateX_step; + dstValue = dstValue + BFVec4::load(src_x) * BFVec4::load(weight_x); + } + } + BFVec4::save((int16_t*)dst, dstValue); +} + +static void MNNConvRunForLineDepthwiseBF16(float* dstO, const float* srcO, const float* weightO, size_t width, size_t src_w_setup, + size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, + size_t srcHStep, size_t dstHStep) { + int dx, fx, fy; + auto dst = (int16_t*)dstO; + auto src = (const int16_t*)srcO; + auto weight = (const int16_t*)weightO; + for (int y = 0; y < height; ++y) { + auto srcY = src + y * srcHStep; + auto dstY = dst + y * dstHStep; + for (dx = 0; dx < width; ++dx) { + auto dst_x = dstY + dx * 4; + BFVec4 dstValue(0.0f); + const auto src_z = srcY + src_w_setup * dx; + const auto weight_z = weight; + for (fy = 0; fy < fh; ++fy) { + const auto src_y = src_z + fy * dilateY_step; + const auto weight_y = weight_z + fy * fw * 4; + for (fx = 0; fx < fw; ++fx) { + const auto weight_x = weight_y + 4 * fx; + const auto src_x = src_y + fx * dilateX_step; + dstValue = dstValue + BFVec4::load(src_x) * BFVec4::load(weight_x); + } + } + BFVec4::save(dst_x, dstValue); + } + } +} +void MNNAxByClampBroadcastUnitBF16(float* CF, const float* AF, const float* BF, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters) { + auto C = (int16_t*)CF; + auto A = (const int16_t*)AF; + auto B = (const int16_t*)BF; + auto minF = BFVec4(parameters[2]); + auto maxF = BFVec4(parameters[3]); + auto 
beta = BFVec4(parameters[1]); + for (int y = 0; y < height; ++y) { + auto a = A + aStride * y; + auto b = B + 4 * y; + auto bv = BFVec4::load(b); + auto c = C + cStride * y; + for (int x = 0; x < width; ++x) { + auto av = BFVec4::load(a + 4 * x); + auto cv = av + bv * beta; + cv = BFVec4::min(cv, maxF); + cv = BFVec4::max(cv, minF); + BFVec4::save(c + 4 * x, cv); + } + } +} + +#if !defined(MNN_USE_SSE) && !defined(MNN_USE_NEON) +void MNNPackC4ForMatMul_A_BF16(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el) { + MNNPackC4ForMatMul_A(destOrigin, sourceGroup, info, el); + return; +} + +void MNNPackForMatMul_B_BF16(float* dest, const float* source, size_t h, size_t l, bool transpose) { + MNNPackForMatMul_B_Template((int16_t*)dest, (const int16_t*)source, h, l, transpose); + return; +} +#endif + +void MNNPackedMatMulRemain_BF16(float* CFloat, const float* AFloat, const float* BFloat, size_t eSize, + const size_t* parameter, float* cacheFloat, const float* postParameters, + const float* biasFloat) { + int16_t* C = (int16_t*)CFloat; + int16_t* A = (int16_t*)AFloat; + int16_t* B = (int16_t*)BFloat; + int16_t* cache = (int16_t*)cacheFloat; + int16_t* bias = (int16_t*)biasFloat; + auto h = parameter[2]; + auto l = parameter[1]; + auto cStride = parameter[3] / sizeof(int16_t); + auto hRemain = parameter[4]; + auto bExtraStride = parameter[5] / sizeof(int16_t); + auto bStride = bExtraStride + l * 6; + auto hC4 = UP_DIV(h, 4); + for (int y = 0; y < hC4; ++y) { + ::memset(C + y * cStride, 0, eSize * 4 * sizeof(int16_t)); + } + float alpha = 1.0f; + float beta = 0.0f; + float minValue = -std::numeric_limits().max(); + float maxValue = std::numeric_limits().max(); + if (nullptr != postParameters) { + minValue = postParameters[2]; + maxValue = postParameters[3]; + alpha = postParameters[0]; + beta = postParameters[1]; + } + + for (int x = 0; x < eSize; ++x) { + auto dst = C + 4 * x; + auto src = + A + x; // input data is packed as tileCount x l x 16, is only one tiled block here, indexed as A[z * 16 + x] + for (int ry = 0; ry < h; ++ry) { + auto y = ry / 4; + auto yRemain = ry % 4; + auto bY = B + y * bStride; + auto dstY = dst + y * cStride; // convert NCHW to NC4HW4 ie 1·(y/4)·X·4 + int wdy = ry / 6; + int wdyRemain = ry % 6; + auto weight = + B + wdy * bStride + + wdyRemain; // weight is packed as (h/6) x l x 6, indexed as B[(ry / 6) * Bstride +z*6 + (ry % 6)] + float summer = 0.0f; + for (int z = 0; z < l; ++z) { + auto aZ = src + z * 16; + auto wZ = weight + z * 6; + summer += MNNLowpToFp32(wZ[0]) * MNNLowpToFp32(aZ[0]); + } + float originValue = MNNLowpToFp32(dstY[yRemain]); + if (nullptr != bias) { + originValue = MNNLowpToFp32(bias[ry]); + } + auto dstValue = originValue * beta + alpha * summer; + dstValue = std::min(dstValue, maxValue); + dstValue = std::max(dstValue, minValue); + dstY[yRemain] = MNNFP32ToBF16(dstValue); + } + } +} + +void MNNPackedMatMul_BF16(float* C, const float* A, const float* B, const size_t* parameter, float* cache, + const float* postParameters, const float* bias) { + return MNNPackedMatMulRemain_BF16(C, A, B, 16, parameter, cache, postParameters, bias); + // return _AVX_MNNPackedMatMulFMA(C, A, B, parameter, cache); +} + + +static void _MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow); + +static void _MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weigthF, float *destF, int cacheLineSize, int ow) { + auto weigth = (const int16_t*)weigthF; + auto dest = (int16_t*)destF; + int 
unit = ow / 2; + MNN_ASSERT(cacheLineSize >= 1); + for (int x = 0; x < unit; ++x) { + auto offset = 4 * 4 * x; + int i = 0; + BFVec4 m0 = BFVec4::load(weigth + i * 16 + 4 * 0) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 0); + BFVec4 m1 = BFVec4::load(weigth + i * 16 + 4 * 1) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 1); + BFVec4 m2 = BFVec4::load(weigth + i * 16 + 4 * 2) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 2); + BFVec4 m3 = BFVec4::load(weigth + i * 16 + 4 * 3) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 3); + + for (i = 1; i < cacheLineSize; ++i) { + m0 = m0 + BFVec4::load(weigth + i * 16 + 4 * 0) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 0); + m1 = m1 + BFVec4::load(weigth + i * 16 + 4 * 1) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 1); + m2 = m2 + BFVec4::load(weigth + i * 16 + 4 * 2) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 2); + m3 = m3 + BFVec4::load(weigth + i * 16 + 4 * 3) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 3); + } + + auto o0 = m0 + m1 + m2; + auto o1 = m1 - m2 + m3; + BFVec4::save(dest + 8 * x + 0 * 4, o0); + BFVec4::save(dest + 8 * x + 1 * 4, o1); + } + if (unit * 2 < ow) { + auto offset = 4 * 4 * unit; + int i = 0; + BFVec4 m0 = BFVec4::load(weigth + i * 16 + 4 * 0) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 0); + BFVec4 m1 = BFVec4::load(weigth + i * 16 + 4 * 1) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 1); + BFVec4 m2 = BFVec4::load(weigth + i * 16 + 4 * 2) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 2); + + for (i = 1; i < cacheLineSize; ++i) { + m0 = m0 + BFVec4::load(weigth + i * 16 + 4 * 0) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 0); + m1 = m1 + BFVec4::load(weigth + i * 16 + 4 * 1) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 1); + m2 = m2 + BFVec4::load(weigth + i * 16 + 4 * 2) * BFVec4::load((int16_t*)cacheLine[i] + offset + 4 * 2); + } + + auto o0 = m0 + m1 + m2; + BFVec4::save(dest + 8 * unit + 0 * 4, o0); + } +} +static void _MNNConvDwF23SourceTransUnit(const int16_t *source, int16_t *dest, size_t unit); +static void _MNNSourceTransformCommonF23(const float *sourceF, float *destF, int unit, int iw, int pad, int su, int eu) { + auto source = (const int16_t*)sourceF; + auto dest = (int16_t*)destF; + for (int x = 0; x < su; ++x) { + auto dstX = dest + 4 * 4 * x; + auto sx = x * 2 - (int)pad; + auto ex = sx + 4; + + auto clampSx = std::max(sx, 0); + auto clampEx = std::min(ex, (int)iw); + + BFVec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f}; + for (int i = clampSx; i < clampEx; ++i) { + v[i - sx] = BFVec4::load(source + 4 * i); + } + auto m0 = v[0] - v[2]; + auto m1 = v[1] + v[2]; + auto m2 = v[2] - v[1]; + auto m3 = v[3] - v[1]; + + BFVec4::save(dstX + 4 * 0, m0); + BFVec4::save(dstX + 4 * 1, m1); + BFVec4::save(dstX + 4 * 2, m2); + BFVec4::save(dstX + 4 * 3, m3); + } + _MNNConvDwF23SourceTransUnit(source + 4 * (su * 2 - pad), dest + 4 * 4 * su, eu - su); + + for (int x = eu; x < unit; ++x) { + auto dstX = dest + 4 * 4 * x; + auto sx = x * 2 - (int)pad; + auto ex = sx + 4; + + auto clampSx = std::max(sx, 0); + auto clampEx = std::min(ex, (int)iw); + + BFVec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f}; + for (int i = clampSx; i < clampEx; ++i) { + v[i - sx] = BFVec4::load(source + 4 * i); + } + auto m0 = v[0] - v[2]; + auto m1 = v[1] + v[2]; + auto m2 = v[2] - v[1]; + auto m3 = v[3] - v[1]; + + BFVec4::save(dstX + 4 * 0, m0); + BFVec4::save(dstX + 4 * 1, m1); + BFVec4::save(dstX + 4 * 2, m2); + BFVec4::save(dstX + 4 * 3, m3); + } +} + +static void 
_MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigthF, float *destF, size_t ow) { + int unit = ow / 2; + auto weigth = (const int16_t*)weigthF; + auto dest = (int16_t*)destF; + + auto w00 = BFVec4::load(weigth + 0 * 16 + 4 * 0); + auto w01 = BFVec4::load(weigth + 0 * 16 + 4 * 1); + auto w02 = BFVec4::load(weigth + 0 * 16 + 4 * 2); + auto w03 = BFVec4::load(weigth + 0 * 16 + 4 * 3); + auto w10 = BFVec4::load(weigth + 1 * 16 + 4 * 0); + auto w11 = BFVec4::load(weigth + 1 * 16 + 4 * 1); + auto w12 = BFVec4::load(weigth + 1 * 16 + 4 * 2); + auto w13 = BFVec4::load(weigth + 1 * 16 + 4 * 3); + auto w20 = BFVec4::load(weigth + 2 * 16 + 4 * 0); + auto w21 = BFVec4::load(weigth + 2 * 16 + 4 * 1); + auto w22 = BFVec4::load(weigth + 2 * 16 + 4 * 2); + auto w23 = BFVec4::load(weigth + 2 * 16 + 4 * 3); + for (int x = 0; x < unit; ++x) { + auto offset = 4 * 4 * x; + int i = 0; + BFVec4 m0 = w00 * BFVec4::load((int16_t*)cacheLine[0] + offset + 4 * 0); + BFVec4 m1 = w01 * BFVec4::load((int16_t*)cacheLine[0] + offset + 4 * 1); + BFVec4 m2 = w02 * BFVec4::load((int16_t*)cacheLine[0] + offset + 4 * 2); + BFVec4 m3 = w03 * BFVec4::load((int16_t*)cacheLine[0] + offset + 4 * 3); + + m0 = m0 + w10 * BFVec4::load((int16_t*)cacheLine[1] + offset + 4 * 0); + m1 = m1 + w11 * BFVec4::load((int16_t*)cacheLine[1] + offset + 4 * 1); + m2 = m2 + w12 * BFVec4::load((int16_t*)cacheLine[1] + offset + 4 * 2); + m3 = m3 + w13 * BFVec4::load((int16_t*)cacheLine[1] + offset + 4 * 3); + + m0 = m0 + w20 * BFVec4::load((int16_t*)cacheLine[2] + offset + 4 * 0); + m1 = m1 + w21 * BFVec4::load((int16_t*)cacheLine[2] + offset + 4 * 1); + m2 = m2 + w22 * BFVec4::load((int16_t*)cacheLine[2] + offset + 4 * 2); + m3 = m3 + w23 * BFVec4::load((int16_t*)cacheLine[2] + offset + 4 * 3); + + auto o0 = m0 + m1 + m2; + auto o1 = m1 - m2 + m3; + BFVec4::save(dest + 8 * x + 0 * 4, o0); + BFVec4::save(dest + 8 * x + 1 * 4, o1); + } + if (unit * 2 < ow) { + auto offset = 4 * 4 * unit; + BFVec4 m0 = w00 * BFVec4::load((int16_t*)cacheLine[0] + offset + 4 * 0); + BFVec4 m1 = w01 * BFVec4::load((int16_t*)cacheLine[0] + offset + 4 * 1); + BFVec4 m2 = w02 * BFVec4::load((int16_t*)cacheLine[0] + offset + 4 * 2); + + m0 = m0 + w10 * BFVec4::load((int16_t*)cacheLine[1] + offset + 4 * 0); + m1 = m1 + w11 * BFVec4::load((int16_t*)cacheLine[1] + offset + 4 * 1); + m2 = m2 + w12 * BFVec4::load((int16_t*)cacheLine[1] + offset + 4 * 2); + + m0 = m0 + w20 * BFVec4::load((int16_t*)cacheLine[2] + offset + 4 * 0); + m1 = m1 + w21 * BFVec4::load((int16_t*)cacheLine[2] + offset + 4 * 1); + m2 = m2 + w22 * BFVec4::load((int16_t*)cacheLine[2] + offset + 4 * 2); + auto o0 = m0 + m1 + m2; + BFVec4::save(dest + 8 * unit + 0 * 4, o0); + } +} +static void _MNNConvDwF23SourceTransUnit(const int16_t *source, int16_t *dest, size_t unit) { + if (unit <= 0) { + return; + } + BFVec4 v0 = BFVec4::load(source + 4 * 0); + BFVec4 v1 = BFVec4::load(source + 4 * 1); + BFVec4 v2; + BFVec4 v3; + source += 8; + + for (int x = 0; x < unit; ++x) { + v2 = BFVec4::load(source + 0 * 4); + v3 = BFVec4::load(source + 1 * 4); + auto m0 = v0 - v2; + auto m1 = v1 + v2; + auto m2 = v2 - v1; + auto m3 = v3 - v1; + + BFVec4::save(dest + 4 * 0, m0); + BFVec4::save(dest + 4 * 1, m1); + BFVec4::save(dest + 4 * 2, m2); + BFVec4::save(dest + 4 * 3, m3); + + source += 8; + dest += 16; + + v0 = v2; + v1 = v3; + } +} + +static void _MNNMatrixSub(float* CF, const float* AF, const float* BF, size_t widthC4, size_t cStride, size_t aStride, + size_t bStride, size_t height) { + auto A = (int16_t*)AF; + 
auto B = (int16_t*)BF; + auto C = (int16_t*)CF; + for (int y = 0; y < height; ++y) { + auto a = A + aStride * y; + auto b = B + bStride * y; + auto c = C + cStride * y; + for (int x = 0; x < widthC4; ++x) { + BFVec4::save(c + 4 * x, BFVec4::load(a + 4 * x) - BFVec4::load(b + 4 * x)); + } + } +} +static void _MNNMatrixAdd(float* CF, const float* AF, const float* BF, size_t widthC4, size_t cStride, size_t aStride, + size_t bStride, size_t height) { + auto A = (int16_t*)AF; + auto B = (int16_t*)BF; + auto C = (int16_t*)CF; + for (int y = 0; y < height; ++y) { + auto a = A + aStride * y; + auto b = B + bStride * y; + auto c = C + cStride * y; + for (int x = 0; x < widthC4; ++x) { + BFVec4::save(c + 4 * x, BFVec4::load(a + 4 * x) + BFVec4::load(b + 4 * x)); + } + } +} + +static void _MNNStrassenMergeCFunction(float* c11F, float* c12F, float* c21F, float* c22F, float* xAddrF, size_t cStride, + size_t eSub, size_t hSub) { + auto c11 = (int16_t*)c11F; + auto c12 = (int16_t*)c12F; + auto c21 = (int16_t*)c21F; + auto c22 = (int16_t*)c22F; + auto xAddr = (int16_t*)xAddrF; + for (int y=0; yMNNConvRunForLineDepthwise = MNNConvRunForLineDepthwiseBF16; + gInstance->MNNConvRunForUnitDepthWise = MNNConvRunForUnitDepthWiseBF16; + gInstance->MNNAxByClampBroadcastUnit = MNNAxByClampBroadcastUnitBF16; + gInstance->MNNFp32ToLowp = _MNNFp32ToLowp; + gInstance->MNNLowpToFp32 = _MNNLowpToFp32; + gInstance->bytes = 2; + gInstance->pack = 4; + gInstance->MNNPackCUnit = (decltype(gInstance->MNNPackCUnit))MNNPackC4Int16; + gInstance->MNNUnpackCUnit = (decltype(gInstance->MNNUnpackCUnit))MNNUnpackC4Int16; + gInstance->MNNUnpackCUnitTranspose = (decltype(gInstance->MNNUnpackCUnitTranspose))MNNPackTransposeInt16; + gInstance->MNNPackCUnitTranspose = (decltype(gInstance->MNNPackCUnitTranspose))MNNUnpackTransposeInt16; + gInstance->MNNConvDwF23MulTransUnit = _MNNConvDwF23MulTransUnit; + gInstance->MNNSourceTransformCommonF23 = _MNNSourceTransformCommonF23; + gInstance->MNNMultiAndDestTransformCommon23 = _MNNMultiAndDestTransformCommon23; + gInstance->MNNMatrixAdd = _MNNMatrixAdd; + gInstance->MNNMatrixSub = _MNNMatrixSub; + gInstance->MNNStrassenMergeCFunction = _MNNStrassenMergeCFunction; + gInstance->penalty = 10.0f; + gInstance->MNNScaleAndAddBias = _MNNScaleAndAddBias; + gInstance->MNNCopyC4WithStride = MNNCopyC4Int16WithStride; + gInstance->MNNAddC4WithStride = _MNNAddC4WithStride; + gInstance->chooseWinoDestTransform = (decltype(gInstance->chooseWinoDestTransform))(WinogradFunctionHalf::chooseDestTransform); + gInstance->chooseWinoSourceTransform = (decltype(gInstance->chooseWinoSourceTransform))(WinogradFunctionHalf::chooseSourceTransform); + gInstance->MNNDeconvRunForLineDepthwise = (decltype(gInstance->MNNDeconvRunForLineDepthwise))_MNNDeconvRunForLineDepthwise; + gInstance->MNNDeconvRunForUnitDepthWise = (decltype(gInstance->MNNDeconvRunForUnitDepthWise))_MNNDeconvRunForUnitDepthWise; + +#if !defined(MNN_USE_SSE) && !defined(MNN_USE_NEON) + gInstance->penalty = 1.5f; + gInstance->MNNPackForMatMul_B = MNNPackForMatMul_B_BF16; // common function MNNPackForMatMul_B_BF16 is needed even with out sse or arm neon. 
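+    // Portable C++ fallback kernels, compiled only when neither SSE nor NEON is
+    // available; the MNN_USE_SSE / MNN_USE_NEON branches below install the tuned
+    // implementations instead.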
+ gInstance->MNNPackC4ForMatMul_A = MNNPackC4ForMatMul_A_BF16;// + gInstance->MNNPackedMatMul = MNNPackedMatMul_BF16; + gInstance->MNNPackedMatMulRemain = MNNPackedMatMulRemain_BF16; +#endif + +#if defined(MNN_USE_SSE) + gInstance->MNNPackForMatMul_B = _SSE_MNNPackForMatMul_B_BF16; + auto cpuFlags = libyuv::InitCpuFlags(); + if (!(cpuFlags & libyuv::kCpuHasF16C)) { + return false; + } + if (cpuFlags & libyuv::kCpuHasAVX2) { + gInstance->MNNPackForMatMul_B = _AVX_MNNPackForMatMul_B_BF16; + gInstance->MNNGetMatMulPackMode = _AVX_MNNGetMatMulPackMode_BF16; + gInstance->MNNPackC4ForMatMul_A = _AVX_MNNPackC4ForMatMul_A_BF16; + gInstance->MNNPackedMatMul = _AVX_MNNPackedMatMulFMA_BF16; + gInstance->MNNPackedMatMulRemain = _AVX_MNNPackedMatMulRemainFMA_BF16; + return true; + } +#elif defined(MNN_USE_NEON) + gInstance->MNNPackForMatMul_B = NEON_MNNPackForMatMul_B_BF16; + gInstance->MNNGetMatMulPackMode = NEON_MNNGetMatMulPackMode_BF16; + gInstance->MNNPackC4ForMatMul_A = NEON_MNNPackC4ForMatMul_A_BF16; + gInstance->MNNPackedMatMul = NEON_MNNPackedMatMul_BF16; + gInstance->MNNPackedMatMulRemain = NEON_MNNPackedMatMulRemain_BF16; + gInstance->MNNConvRunForLineDepthwise = NEON_MNNConvRunForLineDepthwise_BF16; + gInstance->MNNConvRunForUnitDepthWise = NEON_MNNConvRunForUnitDepthWise_BF16; + gInstance->MNNAxByClampBroadcastUnit = NEON_MNNAxByClampBroadcastC4_BF16; + return true; +#endif + // TODO: raw cpu version of bf16 + return true; +} + +CoreFunctions* BF16Functions::get() { + return gInstance; +} +}; diff --git a/source/backend/cpu/bf16/BF16Functions.hpp b/source/backend/cpu/bf16/BF16Functions.hpp new file mode 100644 index 00000000..e6b29a0f --- /dev/null +++ b/source/backend/cpu/bf16/BF16Functions.hpp @@ -0,0 +1,16 @@ +#ifndef BF16Functions_hpp +#define BF16Functions_hpp +#include +#include +#include +#include "core/Macro.h" +#include "../compute/CommonOptFunction.h" +namespace MNN { +class BF16Functions { +public: + static bool init(); + static CoreFunctions* get(); +}; +}; + +#endif diff --git a/source/backend/cpu/bf16/BF16OpRegister.cpp b/source/backend/cpu/bf16/BF16OpRegister.cpp new file mode 100644 index 00000000..d83a64b6 --- /dev/null +++ b/source/backend/cpu/bf16/BF16OpRegister.cpp @@ -0,0 +1,12 @@ +// This file is generated by Shell for ops register +namespace MNN { +extern void ___OpType_Raster__BF16RasterFactory__(); +extern void ___OpType_BinaryOp__BF16BinaryCreator__(); +extern void ___OpType_Pooling__BF16PoolingCreator__(); + +void registerBF16Ops() { +___OpType_Raster__BF16RasterFactory__(); +___OpType_BinaryOp__BF16BinaryCreator__(); +___OpType_Pooling__BF16PoolingCreator__(); +} +} diff --git a/source/backend/cpu/bf16/BF16Pooling.cpp b/source/backend/cpu/bf16/BF16Pooling.cpp new file mode 100644 index 00000000..7818c3ae --- /dev/null +++ b/source/backend/cpu/bf16/BF16Pooling.cpp @@ -0,0 +1,25 @@ +// +// BF16Pooling.cpp +// MNN +// +// Created by MNN on 2020/01/08. 
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "backend/cpu/CPUPool.hpp" +#include "VecHalf.hpp" +#include "BF16Backend.hpp" + +namespace MNN { +using Vec4Half = MNN::Math::VecHalf<4>; + +class BF16PoolingCreator : public BF16Backend::BF16Creator { + virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, + const MNN::Op *op, Backend *backend) const override { + return new CPUPool(backend, op->main_as_Pool()); + } +}; + +REGISTER_BF16_OP_CREATOR(OpType_Pooling, BF16PoolingCreator); + +} // namespace MNN diff --git a/source/backend/cpu/bf16/BF16Raster.cpp b/source/backend/cpu/bf16/BF16Raster.cpp new file mode 100644 index 00000000..46ad602a --- /dev/null +++ b/source/backend/cpu/bf16/BF16Raster.cpp @@ -0,0 +1,23 @@ +// +// BF16Raster.cpp +// MNN +// +// Created by MNN on 2020/5/25. +// Copyright © 2018 Alibaba. All rights reserved. +// +#include "backend/cpu/CPURaster.hpp" +#include "BF16Backend.hpp" +namespace MNN { +class BF16RasterFactory : public BF16Backend::BF16Creator { +public: + virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, + const MNN::Op* op, Backend* backend) const override { + if (outputs[0]->getType().code != halide_type_float) { + return nullptr; + } + return new CPURaster(backend, 2); + } +}; + +REGISTER_BF16_OP_CREATOR(OpType_Raster, BF16RasterFactory); +} diff --git a/source/backend/cpu/bf16/CMakeLists.txt b/source/backend/cpu/bf16/CMakeLists.txt new file mode 100644 index 00000000..4fa9f7a5 --- /dev/null +++ b/source/backend/cpu/bf16/CMakeLists.txt @@ -0,0 +1,16 @@ + +file(GLOB MNN_BF16_SRCS "${CMAKE_CURRENT_LIST_DIR}/*") + +file(GLOB MNN_BF16_SRCS_ASM "${CMAKE_CURRENT_LIST_DIR}/asm/*") + +add_library( + MNN_BF16 + OBJECT + ${MNN_BF16_SRCS} + ) + +if (MNN_USE_SSE) + if (MNN_SSE_USE_FP16_INSTEAD) + target_compile_options(MNN_BF16 PRIVATE -DMNN_SSE_USE_FP16_INSTEAD -mf16c) + endif() +endif() diff --git a/source/backend/cpu/bf16/VecHalf.hpp b/source/backend/cpu/bf16/VecHalf.hpp new file mode 100644 index 00000000..32942638 --- /dev/null +++ b/source/backend/cpu/bf16/VecHalf.hpp @@ -0,0 +1,295 @@ +// +// VecHalf.hpp +// MNN +// +// Created by MNN on 2021/01/26. 
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef VecHalf_hpp +#define VecHalf_hpp +#include "core/Macro.h" +#include +#include // supply std::max and std::min +namespace MNN { +namespace Math { + +template +struct VecHalf { + using VecType = VecHalf; + float value[N]; + VecType operator+(const VecType& lr) { + VecType dst; + for (int i = 0; i < N; ++i) { + dst.value[i] = value[i] + lr.value[i]; + } + return dst; + } + VecType operator-(const VecType& lr) { + VecType dst; + for (int i = 0; i < N; ++i) { + dst.value[i] = value[i] - lr.value[i]; + } + return dst; + } + VecType operator*(const VecType& lr) { + VecType dst; + for (int i = 0; i < N; ++i) { + dst.value[i] = value[i] * lr.value[i]; + } + return dst; + } + VecType operator*(float lr) { + VecType dst; + for (int i = 0; i < N; ++i) { + dst.value[i] = value[i] * lr; + } + return dst; + } + + VecType& operator=(const VecType& lr) { + for (int i = 0; i < N; ++i) { + value[i] = lr.value[i]; + } + return *this; + } + VecType operator-() { + VecType dst; + for (int i = 0; i < N; ++i) { + dst.value[i] = -value[i]; + } + return dst; + } + VecHalf() { + } + VecHalf(const float v) { + for (int i = 0; i < N; ++i) { + value[i] = v; + } + } + + VecHalf(const VecType& lr) { + for (int i = 0; i < N; ++i) { + value[i] = lr.value[i]; + } + } + float operator[](size_t i) { + return value[i]; + } + static VecType load(const int16_t* addr) { + VecType v; + auto tempV = (int32_t*)v.value; + for (int i = 0; i < N; ++i) { + tempV[i] = addr[i] << 16; + } + return v; + } + static void save(int16_t* addr, const VecType& v) { + auto tempV = (int32_t*)v.value; + for (int i = 0; i < N; ++i) { + addr[i] = tempV[i] >> 16; + } + } + static VecType max(const VecType& v1, const VecType& v2) { + VecType dst; + for (int i = 0; i < N; ++i) { + dst.value[i] = std::max(v1.value[i], v2.value[i]); + } + return dst; + } + static VecType min(const VecType& v1, const VecType& v2) { + VecType dst; + for (int i = 0; i < N; ++i) { + dst.value[i] = std::min(v1.value[i], v2.value[i]); + } + return dst; + } +}; + +#if defined(MNN_USE_SSE) +#if defined(_MSC_VER) +#include +#else +#include +#endif + +template<> +struct VecHalf<4> { + using VecType = VecHalf<4>; + __m128 value; + VecType operator+(const VecType& lr) const { + VecType dst = { _mm_add_ps(value, lr.value) }; + return dst; + } + VecType operator-(const VecType& lr) const { + VecType dst = { _mm_sub_ps(value, lr.value) }; + return dst; + } + VecType operator*(const VecType& lr) const { + VecType dst = { _mm_mul_ps(value, lr.value) }; + return dst; + } + VecType operator*(float lr) const { + VecType dst = { _mm_mul_ps(value, _mm_set1_ps(lr)) }; + return dst; + } + + VecType& operator=(const VecType& lr) { + value = lr.value; + return *this; + } + VecType operator-() { + VecType dst; +#if defined(_MSC_VER) + dst.value = _mm_xor_ps(value, _mm_set1_ps(-0.f)); // Using unary operation to SSE vec is GCC extension. We can not do this directly in MSVC. +#else + dst.value = -value; +#endif + return dst; + } + VecHalf() { + } + VecHalf(const float v) { + value = _mm_set1_ps(v); + } + VecHalf(__m128& v) { + value = v; + } + VecHalf(__m128&& v) { + value = std::move(v); + } + VecHalf(const VecType& lr) { + value = lr.value; + } + VecHalf(VecType&& lr) { + value = std::move(lr.value); + } + float operator[](size_t i) { +#if defined(_MSC_VER) // X64 native only mandatory support SSE and SSE2 extension, and we can not find intrinsic function to extract element directly by index in SSE and SSE2 extension. 
+ float temp[4]; + _mm_storeu_ps(temp, value); + return temp[i]; +#else + return value[i]; +#endif + } + static VecType load(const int16_t* addr) { + auto temp = _mm_loadl_epi64((__m128i*)addr); +#ifndef MNN_SSE_USE_FP16_INSTEAD + auto zero = _mm_xor_si128(temp, temp); + auto res = _mm_castsi128_ps(_mm_unpacklo_epi16(zero, temp)); +#else + auto res = _mm_cvtph_ps(temp); +#endif + VecType v = { std::move(res) }; + return v; + } + static void save(int16_t* addr, const VecType& v) { +#ifndef MNN_SSE_USE_FP16_INSTEAD + auto temp = _mm_castps_si128(v.value); + temp = _mm_srai_epi32(temp, 16); + temp = _mm_packs_epi32(temp, temp); +#else + static __m128 gMinValue = _mm_set1_ps(-32768); + static __m128 gMaxValue = _mm_set1_ps(32767); + auto t = _mm_max_ps(v.value, gMinValue); + t = _mm_min_ps(t, gMaxValue); + auto temp = _mm_cvtps_ph(t, 0x8); +#endif + _mm_storel_epi64((__m128i*)addr, temp); + } + static VecType max(const VecType& v1, const VecType& v2) { + VecType dst = { _mm_max_ps(v1.value, v2.value) }; + return dst; + } + static VecType min(const VecType& v1, const VecType& v2) { + VecType dst = { _mm_min_ps(v1.value, v2.value) }; + return dst; + } +}; +#endif + +#if defined(MNN_USE_NEON) +#include + +template<> +struct VecHalf<4> { + using VecType = VecHalf<4>; + float32x4_t value; + VecType operator+(const VecType& lr) const { + VecType dst = { vaddq_f32(value, lr.value) }; + return dst; + } + VecType operator-(const VecType& lr) const { + VecType dst = { vsubq_f32(value, lr.value) }; + return dst; + } + VecType operator*(const VecType& lr) const { + VecType dst = { vmulq_f32(value, lr.value) }; + return dst; + } + VecType operator*(const float lr) const { + VecType dst = { vmulq_f32(value, vdupq_n_f32(lr)) }; + return dst; + } + + VecType& operator=(const VecType& lr) { + value = lr.value; + return *this; + } + VecType operator-() { + VecType dst = { vnegq_f32(value) }; + return dst; + } + VecHalf() { + } + VecHalf(const float v) { + value = vdupq_n_f32(v); + } + VecHalf(float32x4_t& v) { + value = v; + } + VecHalf(float32x4_t&& v) { + value = std::move(v); + } + VecHalf(const VecType& lr) { + value = lr.value; + } + VecHalf(VecType&& lr) { + value = std::move(lr.value); + } + float operator[](const int i) { + // vgetq_lane_f32(value, i) does NOT work, i must be const number such as 0, 2, + return value[i]; + } + + static VecType load(const int16_t* addr) { + + // equivalent to this: + // int16x4_t vec4s16 = vld1_s16(addr); // load bf16 data as fixed point data of 16-bit. + // int32x4_t vec4s32 =vshll_n_s16(vec4s16, 16); // shift left 16bit as 32-bit data. 
+ // float32x4_t vec4f32 = vreinterpretq_f32_s32(vec4s32);// treat 32-bit fix point result as float32 data + // VecType dest = { vec4f32 }; // construct a struct of VecType + + VecType dst = { vreinterpretq_f32_s32(vshll_n_s16(vld1_s16(addr), 16)) }; + return dst; + } + static void save(int16_t* addr, const VecType& v) { + vst1_s16(addr, vshrn_n_s32(vreinterpretq_s32_f32(v.value), 16)); + return; + } + static VecType max(const VecType& v1, const VecType& v2) { + VecType dst = { vmaxq_f32(v1.value, v2.value) }; + return dst; + } + static VecType min(const VecType& v1, const VecType& v2) { + VecType dst = { vminq_f32(v1.value, v2.value) }; + return dst; + } +}; +#endif + +} + +} +#endif diff --git a/source/backend/cpu/bf16/WinogradOptFunctionHalf.cpp b/source/backend/cpu/bf16/WinogradOptFunctionHalf.cpp new file mode 100644 index 00000000..e31a956f --- /dev/null +++ b/source/backend/cpu/bf16/WinogradOptFunctionHalf.cpp @@ -0,0 +1,199 @@ +// +// WinogradOptFunctionHalf.cpp +// MNN +// +// Created by MNN on 2021/03/12. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "WinogradOptFunctionHalf.hpp" +#include +#include +#include "core/Macro.h" +#include "VecHalf.hpp" +using BFVec4 = MNN::Math::VecHalf<4>; + +namespace MNN { +static void _sourceTransformUnit4x4(const int16_t* srcBlock, int16_t* dstStart, size_t srcStep, size_t dstStep) { + BFVec4 s0 = BFVec4::load(srcBlock + 0 * srcStep); + BFVec4 s1 = BFVec4::load(srcBlock + 1 * srcStep); + BFVec4 s2 = BFVec4::load(srcBlock + 2 * srcStep); + BFVec4 s3 = BFVec4::load(srcBlock + 3 * srcStep); + + auto m0 = s0 - s2; + auto m1 = s1 + s2; + auto m2 = s2 - s1; + auto m3 = s3 - s1; + + BFVec4::save(dstStart + 0 * dstStep, m0); + BFVec4::save(dstStart + 1 * dstStep, m1); + BFVec4::save(dstStart + 2 * dstStep, m2); + BFVec4::save(dstStart + 3 * dstStep, m3); +} +static void _destTransformUnit4x2(const int16_t* srcBlock, int16_t* dstStart, size_t srcStep, size_t dstStep) { + BFVec4 s0 = BFVec4::load(srcBlock + 0 * srcStep); + BFVec4 s1 = BFVec4::load(srcBlock + 1 * srcStep); + BFVec4 s2 = BFVec4::load(srcBlock + 2 * srcStep); + BFVec4 s3 = BFVec4::load(srcBlock + 3 * srcStep); + + auto m0 = s0 + s1 + s2; + auto m1 = (s1 - s2) + s3; + + BFVec4::save(dstStart + 0 * dstStep, m0); + BFVec4::save(dstStart + 1 * dstStep, m1); +} +static void _destTransformUnit4x3(const int16_t* srcBlock, int16_t* dstStart, size_t srcStep, size_t dstStep) { + BFVec4 s0 = BFVec4::load(srcBlock + 0 * srcStep); + BFVec4 s1 = BFVec4::load(srcBlock + 1 * srcStep); + BFVec4 s2 = BFVec4::load(srcBlock + 2 * srcStep); + BFVec4 s3 = BFVec4::load(srcBlock + 3 * srcStep); + + auto m0 = s0 + s1 + s2; + auto m1 = (s1 - s2); + auto m2 = (s1 + s2) + s3; + + BFVec4::save(dstStart + 0 * dstStep, m0); + BFVec4::save(dstStart + 1 * dstStep, m1); + BFVec4::save(dstStart + 2 * dstStep, m2); +} + + +#define LOAD6 \ +BFVec4 s0 = BFVec4::load(srcBlock + 0 * srcStep); \ +BFVec4 s1 = BFVec4::load(srcBlock + 1 * srcStep); \ +BFVec4 s2 = BFVec4::load(srcBlock + 2 * srcStep); \ +BFVec4 s3 = BFVec4::load(srcBlock + 3 * srcStep); \ +BFVec4 s4 = BFVec4::load(srcBlock + 4 * srcStep); \ +BFVec4 s5 = BFVec4::load(srcBlock + 5 * srcStep); + +static void _sourceTransformUnit6x6(const int16_t* srcBlock, int16_t* dstStart, size_t srcStep, size_t dstStep) { + LOAD6; + BFVec4 m0 = s0 * 4.f - s2 * 5.f + s4; + + BFVec4 m1 = (s1 + s2) * (-4.f) + (s3 + s4); + BFVec4 m2 = (s1 - s2) * (4.f) + (s4 - s3); + + BFVec4 m3 = s1 * -2.f - s2 + s3 * 2.f + s4; + BFVec4 m4 = s1 * 2.f - s2 - s3 * 2.f + s4; + + BFVec4 
m5 = s1 * 4.f - s3 * 5.f + s5; + + BFVec4::save(dstStart + 0 * dstStep, m0); + BFVec4::save(dstStart + 1 * dstStep, m1); + BFVec4::save(dstStart + 2 * dstStep, m2); + BFVec4::save(dstStart + 3 * dstStep, m3); + BFVec4::save(dstStart + 4 * dstStep, m4); + BFVec4::save(dstStart + 5 * dstStep, m5); +} + +static void _destTransformUnit6x5(const int16_t* srcBlock, int16_t* dstStart, size_t srcStep, size_t dstStep) { + BFVec4 s0 = BFVec4::load(srcBlock + 0 * srcStep); + BFVec4 s1 = BFVec4::load(srcBlock + 1 * srcStep); + BFVec4 s2 = BFVec4::load(srcBlock + 2 * srcStep); + BFVec4 s3 = BFVec4::load(srcBlock + 3 * srcStep); + BFVec4 s4 = BFVec4::load(srcBlock + 4 * srcStep); + BFVec4 s5 = BFVec4::load(srcBlock + 5 * srcStep); + + auto m0 = s0 + s1 + s2 + s3 + s4; + auto m1 = (s1 - s2) + (s3 - s4) * 2.f; + auto m2 = (s1 + s2) + (s3 + s4) * 4.f; + auto m3 = (s1 - s2) + (s3 - s4) * 8.f; + auto m4 = (s1 + s2) + (s3 + s4) * 16.f + s5; + + BFVec4::save(dstStart + 0 * dstStep, m0); + BFVec4::save(dstStart + 1 * dstStep, m1); + BFVec4::save(dstStart + 2 * dstStep, m2); + BFVec4::save(dstStart + 3 * dstStep, m3); + BFVec4::save(dstStart + 4 * dstStep, m4); +} +static void _destTransformUnit6x4(const int16_t* srcBlock, int16_t* dstStart, size_t srcStep, size_t dstStep) { + BFVec4 s0 = BFVec4::load(srcBlock + 0 * srcStep); + BFVec4 s1 = BFVec4::load(srcBlock + 1 * srcStep); + BFVec4 s2 = BFVec4::load(srcBlock + 2 * srcStep); + BFVec4 s3 = BFVec4::load(srcBlock + 3 * srcStep); + BFVec4 s4 = BFVec4::load(srcBlock + 4 * srcStep); + BFVec4 s5 = BFVec4::load(srcBlock + 5 * srcStep); + auto v0 = s3 + s4; + auto v1 = s3 - s4; + auto v2 = s1 + s2; + auto v3 = s1 - s2; + + auto m0 = s0 + v2 + v0; + auto m1 = v3 + v1 + v1; + auto m2 = v2 + v0 * 4.f; + auto m3 = v3 + v1 * 8.f + s5; + + BFVec4::save(dstStart + 0 * dstStep, m0); + BFVec4::save(dstStart + 1 * dstStep, m1); + BFVec4::save(dstStart + 2 * dstStep, m2); + BFVec4::save(dstStart + 3 * dstStep, m3); +} +static void _destTransformUnit6x3(const int16_t* srcBlock, int16_t* dstStart, size_t srcStep, size_t dstStep) { + BFVec4 s0 = BFVec4::load(srcBlock + 0 * srcStep); + BFVec4 s1 = BFVec4::load(srcBlock + 1 * srcStep); + BFVec4 s2 = BFVec4::load(srcBlock + 2 * srcStep); + BFVec4 s3 = BFVec4::load(srcBlock + 3 * srcStep); + BFVec4 s4 = BFVec4::load(srcBlock + 4 * srcStep); + BFVec4 s5 = BFVec4::load(srcBlock + 5 * srcStep); + + auto m0 = s0 + s1 + s2 + s3 + s4; + auto m1 = (s1 - s2) + (s3 - s4) * 2.f; + auto m2 = (s1 + s2) + (s3 + s4) * 4.f + s5; + + BFVec4::save(dstStart + 0 * dstStep, m0); + BFVec4::save(dstStart + 1 * dstStep, m1); + BFVec4::save(dstStart + 2 * dstStep, m2); +} +static void _destTransformUnit6x2(const int16_t* srcBlock, int16_t* dstStart, size_t srcStep, size_t dstStep) { + BFVec4 s0 = BFVec4::load(srcBlock + 0 * srcStep); + BFVec4 s1 = BFVec4::load(srcBlock + 1 * srcStep); + BFVec4 s2 = BFVec4::load(srcBlock + 2 * srcStep); + BFVec4 s3 = BFVec4::load(srcBlock + 3 * srcStep); + BFVec4 s4 = BFVec4::load(srcBlock + 4 * srcStep); + BFVec4 s5 = BFVec4::load(srcBlock + 5 * srcStep); + + auto m0 = s0 + s1 + s2 + s3 + s4; + auto m1 = (s1 - s2) + (s3 - s4) * 2.f + s5; + + BFVec4::save(dstStart + 0 * dstStep, m0); + BFVec4::save(dstStart + 1 * dstStep, m1); +} + +static WinogradFunctionHalf::TransformFunc gProcUnit6[] = { + nullptr, // 0 + nullptr, // 1 + _destTransformUnit6x2, + _destTransformUnit6x3, + _destTransformUnit6x4, + _destTransformUnit6x5, +}; + + +WinogradFunctionHalf::TransformFunc WinogradFunctionHalf::chooseSourceTransform(int k, int w) { + 
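+    // Only the 4x4 and 6x6 source transforms (presumably the Winograd F(2x2,3x3)
+    // and F(4x4,3x3) tiles) are implemented for BF16; any other size asserts in
+    // debug builds and returns nullptr.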
if (6 == k && 6 == w) { + return _sourceTransformUnit6x6; + } + if (4 == k && 4 == w) { + return _sourceTransformUnit4x4; + } + MNN_ASSERT(false); + return nullptr; +} + +WinogradFunctionHalf::TransformFunc WinogradFunctionHalf::chooseDestTransform(int k, int h) { + if (6 == k) { + if (h <= 1 || h > 5) { + return nullptr; + } + return gProcUnit6[h]; + } + if (2 == h && 4 == k) { + return _destTransformUnit4x2; + } + if (3 == h && 4 == k) { + return _destTransformUnit4x3; + } + return nullptr; +} + +} // namespace MNN diff --git a/source/backend/cpu/bf16/WinogradOptFunctionHalf.hpp b/source/backend/cpu/bf16/WinogradOptFunctionHalf.hpp new file mode 100644 index 00000000..e7738b7d --- /dev/null +++ b/source/backend/cpu/bf16/WinogradOptFunctionHalf.hpp @@ -0,0 +1,26 @@ +// +// WinogradOptFunctionHalf.hpp +// MNN +// +// Created by MNN on 2021/03/12. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef WinogradOptFunctionHalf_hpp +#define WinogradOptFunctionHalf_hpp + +#include +#include + +namespace MNN { +class WinogradFunctionHalf { +public: + typedef void (*TransformFunc)(const int16_t* srcBlock, int16_t* dstStart, size_t srcStep, size_t dstStep); + + /*Use the generator with interp 0.5*/ + static TransformFunc chooseSourceTransform(int k, int w); + static TransformFunc chooseDestTransform(int k, int h); +}; +} // namespace MNN + +#endif /* WinogradOptFunctionHalf_hpp */ diff --git a/source/backend/cpu/bf16/register.py b/source/backend/cpu/bf16/register.py new file mode 100644 index 00000000..d91b56ae --- /dev/null +++ b/source/backend/cpu/bf16/register.py @@ -0,0 +1,40 @@ +#!/usr/bin/python +import os +def generateCPUFile(rootDir): + cpuDir = rootDir + cpuRegFile = os.path.join(cpuDir, "BF16OpRegister.cpp") + fileNames = os.listdir(cpuDir) + print(fileNames) + if len(fileNames) <= 1: + # Error dirs + return + funcNames = [] + for fi in fileNames: + f = os.path.join(cpuDir, fi) + if os.path.isdir(f): + continue + print(f) + with open(f) as fileC: + c = fileC.read().split('\n') + c = list(filter(lambda l:l.find('REGISTER_BF16_OP_CREATOR')>=0, c)) + c = list(filter(lambda l:l.find('OpType')>=0, c)) + for l in c: + l = l.split('(')[1] + l = l.split(')')[0] + l = l.replace(' ', '') + l = l.split(',') + funcName = '___' + l[0] + '__' + l[1] + '__' + funcNames.append(funcName) + with open(cpuRegFile, 'w') as f: + f.write('// This file is generated by Shell for ops register\n') + f.write('namespace MNN {\n') + for l in funcNames: + f.write("extern void " + l + '();\n') + f.write('\n') + f.write('void registerBF16Ops() {\n') + for l in funcNames: + f.write(l+'();\n') + f.write("}\n}\n") + +import sys +generateCPUFile(sys.argv[1]) diff --git a/source/backend/cpu/compute/CommonOptFunction.cpp b/source/backend/cpu/compute/CommonOptFunction.cpp index 079f4eb1..08485e7f 100644 --- a/source/backend/cpu/compute/CommonOptFunction.cpp +++ b/source/backend/cpu/compute/CommonOptFunction.cpp @@ -7,22 +7,143 @@ // #include "CommonOptFunction.h" +#include "ConvOpt.h" +#include "WinogradOptFunction.hpp" #include #include #include #include "math/Vec.hpp" #include -int MNNGetC4DivNumber(int h) { - auto remain = h % 4; - if (0 == remain) { - return h / 4; - } - if (4 % remain == 0) { - return h / remain; - } - return h; + +#ifndef MNN_USE_NEON + +void MNNGetMatMulPackMode(int* eP, int *lP, int* hP) { + *eP = 16; + *lP = 1; + *hP = 4; } +template +void MNNPackForMatMul_B_Template(DataType* dest, const DataType* source, size_t h, size_t l, bool transpose) { + auto hP = h / 4; + auto hR = hP * 4; + if 
(hR != h) { + ::memset(dest, 0, UP_DIV(h, 4)*4*l*sizeof(DataType)); + } + if (!transpose) { + for (int y=0; y 0) { + auto destY = dest + hP * 4 * l; + auto sourceY = source + hP * 4; + for (int x=0; x(dest, source, h, l, transpose); +} + +void MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias) { + return MNNPackedMatMulRemain(C, A, B, 16, parameter, postParameters, bias); + //return _AVX_MNNPackedMatMulFMA(C, A, B, parameter, cache); +} + +void MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias) { + auto h = parameter[2]; + auto l = parameter[1]; + auto cStride = parameter[3] / sizeof(float); + auto hRemain = parameter[4]; + auto bExtraStride = parameter[5] / sizeof(float); + auto bStride = bExtraStride + l * 4; + auto hC4 = UP_DIV(h, 4); + for (int y=0; y().max(); + float maxValue = std::numeric_limits().max(); + if (nullptr != postParameters) { + minValue = postParameters[2]; + maxValue = postParameters[3]; + alpha = postParameters[0]; + beta = postParameters[1]; + } + + for (int x=0; x; -void MNNScaleAndAddBiasOutside(float* dst, const float* src, const float* bias, const float* alpha, size_t planeNumber, - size_t biasNumber) { - for (size_t p = 0; p < planeNumber; ++p) { - float* dstPlane = dst + p * biasNumber; - const float* srcPlane = src + p * biasNumber; - for (int z = 0; z < biasNumber; ++z) { - dstPlane[z] = srcPlane[z] * alpha[z] + bias[z]; - } - } -} - - - #ifndef MNN_USE_NEON #ifndef MNN_USE_SSE -void MNNAddBias(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) { - for (int z = 0; z < biasNumber; ++z) { - float* dstZ = dst + planeNumber * 4 * z; - const float* biasZ = bias + 4 * z; - for (int p = 0; p < planeNumber; ++p) { - float* dstX = dstZ + 4 * p; - for (int i = 0; i < 4; ++i) { - dstX[i] += biasZ[i]; - } - } - } -} - -void MNNAddBiasRelu(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) { - for (int z = 0; z < biasNumber; ++z) { - float* dstZ = dst + planeNumber * 4 * z; - const float* biasZ = bias + 4 * z; - for (int p = 0; p < planeNumber; ++p) { - float* dstX = dstZ + 4 * p; - for (int i = 0; i < 4; ++i) { - dstX[i] += biasZ[i]; - if (dstX[i] < 0) { - dstX[i] = 0; - } - } - } - } -} - -void MNNAddBiasRelu6(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) { - for (int z = 0; z < biasNumber; ++z) { - float* dstZ = dst + planeNumber * 4 * z; - const float* biasZ = bias + 4 * z; - for (int p = 0; p < planeNumber; ++p) { - float* dstX = dstZ + 4 * p; - for (int i = 0; i < 4; ++i) { - dstX[i] += biasZ[i]; - if (dstX[i] < 0) { - dstX[i] = 0; - } - if (dstX[i] > 6.0f) { - dstX[i] = 6.0f; - } - } - } - } -} void MNNCopyC4WithStride(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count) { for (int i = 0; i < count; ++i) { @@ -225,122 +282,6 @@ void MNNUnpackC4(float* dst, const float* src, size_t area, size_t depth) { } } -void MNNGetMatMulPackMode(int* eP, int *lP, int* hP) { - *eP = 16; - *lP = 1; - *hP = 6; -} - -void MNNPackForMatMul_B(float* dest, const float* source, size_t h, size_t l, bool transpose) { - auto hP = h / 6; - auto hR = hP * 6; - if (hR != h) { - ::memset(dest, 0, UP_DIV(h, 6)*6*l*sizeof(float)); - } - if (!transpose) { - for (int y=0; y 0) { - auto destY = dest + hP * 6 * l; - auto sourceY = source + hP * 6; - for (int x=0; x().max(); - float maxValue = std::numeric_limits().max(); - if 
(nullptr != postParameters) { - minValue = postParameters[2]; - maxValue = postParameters[3]; - alpha = postParameters[0]; - beta = postParameters[1]; - } - - for (int x=0; x 0) { + MNNHardSwish(dst, src, sizeQuad); + start = sizeQuad * 4; + } +#endif +#ifdef MNN_USE_NEON + float32x4_t zero = vdupq_n_f32(0.f); + float32x4_t three = vdupq_n_f32(3.f); + float32x4_t six = vdupq_n_f32(6.f); + float32x4_t divsix = vdupq_n_f32(1.0f/6.f); + for (int i = 0; i < sizeQuad; i++) { + auto x = vld1q_f32(src + 4 * i); + auto y = vmulq_f32(vmulq_f32(x, vminq_f32(vmaxq_f32(vaddq_f32(x, three), zero), six)), divsix); + vst1q_f32(dst + 4 * i, y); + } + start = sizeQuad * 4; +#endif + for (int j = start; j < size; j++) { + if (src[j] <= -3) { + dst[j] = 0; + } else if (src[j] >= 3){ + dst[j] = src[j]; + } else { + dst[j] = src[j] * (src[j] + 3) / 6.f; + } + } +} + void MNNScaleAndAddBiasScalar(float* dst, const float* src, float bias, float alpha, size_t number) { int numberC4 = (int)number / 4; int start = 0; @@ -887,7 +873,7 @@ void MNNAxByClamp(float* C, const float* A, const float* B, size_t width, size_t } } #ifndef MNN_USE_NEON -void MNNAxByClampBroadcastC4(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters) { +void MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters) { auto minF = Vec4(parameters[2]); auto maxF = Vec4(parameters[3]); auto beta = Vec4(parameters[1]); @@ -905,7 +891,6 @@ void MNNAxByClampBroadcastC4(float* C, const float* A, const float* B, size_t wi } } } - void MNNVectorTop1Float(float* input, float* maxValue, int32_t* maxIndex, size_t inputCountUnit) { float maxV = input[0]; int maxIdx = 0; @@ -994,3 +979,399 @@ void MNNComputeMatMulForE_1(const float* A, const float* B, float* C, const floa } } #endif + +void MNNPackC4Int16(int16_t* dst, const int16_t* src, size_t area, size_t depth) { + int z, x; + int cur = 0; + memset(dst, 0, area * UP_DIV(depth, 4) * 4 * sizeof(int16_t)); + for (z = 0; z < depth; ++z) { + int plane = z / 4; + int16_t* dstPlane = plane * area * 4 + dst; + int offset = z % 4; + for (x = 0; x < area; ++x) { + dstPlane[4 * x + offset] = src[cur++]; + } + } +} + +void MNNUnpackC4Int16(int16_t* dst, const int16_t* src, size_t area, size_t depth) { + int x; + int z; + int cur = 0; + for (z = 0; z < depth; ++z) { + int plane = z / 4; + const int16_t* srcPlane = plane * area * 4 + src; + int offset = z % 4; + for (x = 0; x < area; ++x) { + dst[cur++] = srcPlane[4 * x + offset]; + } + } +} + +void MNNUnpackTransposeInt16(int16_t* dst, const int16_t* src, size_t area, size_t depth) { + if (depth == 4) { + ::memcpy(dst, src, area * depth * sizeof(int16_t)); + return; + } + int c = (int)depth; + int cDiv4 = c / 4; + int cAlign = cDiv4 * 4; + for (int hi = 0; hi < area; ++hi) { + auto srcHeight = (src + hi * c); + auto dstHeight = (dst + hi * 4); + for (int ci = 0; ci < cDiv4; ++ci) { + for (int i = 0; i < 4; ++i) { + dstHeight[ci * area * 4 + i] = srcHeight[4 * ci + i]; + } + } + } + + if (cAlign == c) { + return; + } + + int cReamin = c - cAlign; + auto srcAlign = src + cAlign; + auto dstAlign = dst + area * cAlign; + + for (int hi = 0; hi < area; ++hi) { + auto srcHeight = srcAlign + hi * c; + auto dstHeight = dstAlign + hi * 4; + for (int i = 0; i < 4; ++i) { + dstHeight[i] = 0; + } + for (int ci = 0; ci < cReamin; ++ci) { + dstHeight[ci] = srcHeight[ci]; + } + } +} +void 
MNNPackTransposeInt16(int16_t* dst, const int16_t* src, size_t area, size_t depth) { + if (1 == area) { + ::memcpy(dst, src, depth * sizeof(int16_t)); + return; + } + int c = (int)depth; + int cDiv4 = c / 4; + int cAlign = cDiv4 * 4; + if (cAlign == c) { + int64_t* dst32 = (int64_t*)dst; + const int64_t* src32 = (int64_t*)src; + for (int hi = 0; hi < area; ++hi) { + auto srcHeight = src32 + hi; + auto dstHeight = dst32 + hi * cDiv4; + for (int ci = 0; ci < cDiv4; ++ci) { + dstHeight[ci] = srcHeight[ci * area]; + } + } + return; + } + + for (int hi = 0; hi < area; ++hi) { + auto srcHeight = src + hi * 4; + auto dstHeight = dst + hi * c; + for (int ci = 0; ci < cDiv4; ++ci) { + for (int i = 0; i < 4; ++i) { + dstHeight[ci * 4 + i] = srcHeight[4 * ci * area + i]; + } + } + } + + int cReamin = c - cAlign; + auto srcAlign = src + area * cAlign; + auto dstAlign = dst + cAlign; + + for (int hi = 0; hi < area; ++hi) { + auto srcHeight = srcAlign + hi * 4; + auto dstHeight = dstAlign + hi * c; + + for (int ci = 0; ci < cReamin; ++ci) { + dstHeight[ci] = srcHeight[ci]; + } + } +} + +void MNNCopyC4Int16WithStride(const float* sourceF, float* destF, size_t srcStride, size_t dstStride, size_t count) { + auto source = (int16_t*)sourceF; + auto dest = (int16_t*)destF; + for (int i = 0; i < count; ++i) { + auto s = source + i * srcStride; + auto d = dest + i * dstStride; + *(int64_t*)(d) = *((int64_t*)s); + } +} + + +void MNNSin(float* dst, const float* src, size_t dataSize) { + for (int i = 0; i < dataSize; i++) { + dst[i] = sinf(src[i]); + } +} + +void MNNSigmoid(float* dst, const float* src, size_t dataSize) { + MNNExp(dst, src, dataSize); + for (int i = 0; i < dataSize; ++i) { + dst[i] = 1.0f / (1.0f + dst[i]); + } +} + +/** + Modified from https://github.com/alibaba/MNN/pull/1359 + Thanks for https://github.com/hroken + */ +void MNNSigmoidLowp(float* dst, const float* src, size_t dataSize) { + MNNExp(dst, src, dataSize); +#ifdef MNN_USE_NEON + int dataC4 = (int)dataSize / 4; + if(dataC4 > 0) { + // neon optimization for sigmid cpu + float32x4_t value = vdupq_n_f32(1.0f); + float32x4_t out = vld1q_f32(dst); + for (int i = 1; i < dataC4; ++i) { + out = vrecpeq_f32(vaddq_f32(value,out)); + vst1q_f32(dst ,out); + dst += 4; + out = vld1q_f32(dst); + } + out = vrecpeq_f32(vaddq_f32(value,out)); + vst1q_f32(dst, out); + dataSize = dataSize - 4 * dataC4; + } +#endif + for (int i = 0; i < dataSize; ++i) { + dst[i] = 1.0f / (1.0f + dst[i]); + } +} +extern "C" { +void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow); +} + +void MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow) { + int unit = ow / 2; + MNN_ASSERT(cacheLineSize >= 1); + for (int x = 0; x < unit; ++x) { + auto offset = 4 * 4 * x; + int i = 0; + Vec4 m0 = Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0); + Vec4 m1 = Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1); + Vec4 m2 = Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2); + Vec4 m3 = Vec4::load(weigth + i * 16 + 4 * 3) * Vec4::load(cacheLine[i] + offset + 4 * 3); + + for (i = 1; i < cacheLineSize; ++i) { + m0 = m0 + Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0); + m1 = m1 + Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1); + m2 = m2 + Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2); + m3 = m3 + 
Vec4::load(weigth + i * 16 + 4 * 3) * Vec4::load(cacheLine[i] + offset + 4 * 3); + } + + auto o0 = m0 + m1 + m2; + auto o1 = m1 - m2 + m3; + Vec4::save(dest + 8 * x + 0 * 4, o0); + Vec4::save(dest + 8 * x + 1 * 4, o1); + } + if (unit * 2 < ow) { + auto offset = 4 * 4 * unit; + int i = 0; + Vec4 m0 = Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0); + Vec4 m1 = Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1); + Vec4 m2 = Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2); + + for (i = 1; i < cacheLineSize; ++i) { + m0 = m0 + Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0); + m1 = m1 + Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1); + m2 = m2 + Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2); + } + + auto o0 = m0 + m1 + m2; + Vec4::save(dest + 8 * unit + 0 * 4, o0); + } +} +extern "C" { +void MNNConvDwF23SourceTransUnit(const float *source, float *dest, size_t unit); +} + +void MNNSourceTransformCommonF23(const float *source, float *dest, int unit, int iw, int pad, int su, int eu) { + for (int x = 0; x < su; ++x) { + auto dstX = dest + 4 * 4 * x; + auto sx = x * 2 - (int)pad; + auto ex = sx + 4; + + auto clampSx = std::max(sx, 0); + auto clampEx = std::min(ex, (int)iw); + + Vec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f}; + for (int i = clampSx; i < clampEx; ++i) { + v[i - sx] = Vec4::load(source + 4 * i); + } + auto m0 = v[0] - v[2]; + auto m1 = v[1] + v[2]; + auto m2 = v[2] - v[1]; + auto m3 = v[3] - v[1]; + + Vec4::save(dstX + 4 * 0, m0); + Vec4::save(dstX + 4 * 1, m1); + Vec4::save(dstX + 4 * 2, m2); + Vec4::save(dstX + 4 * 3, m3); + } + MNNConvDwF23SourceTransUnit(source + 4 * (su * 2 - pad), dest + 4 * 4 * su, eu - su); + + for (int x = eu; x < unit; ++x) { + auto dstX = dest + 4 * 4 * x; + auto sx = x * 2 - (int)pad; + auto ex = sx + 4; + + auto clampSx = std::max(sx, 0); + auto clampEx = std::min(ex, (int)iw); + + Vec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f}; + for (int i = clampSx; i < clampEx; ++i) { + v[i - sx] = Vec4::load(source + 4 * i); + } + auto m0 = v[0] - v[2]; + auto m1 = v[1] + v[2]; + auto m2 = v[2] - v[1]; + auto m3 = v[3] - v[1]; + + Vec4::save(dstX + 4 * 0, m0); + Vec4::save(dstX + 4 * 1, m1); + Vec4::save(dstX + 4 * 2, m2); + Vec4::save(dstX + 4 * 3, m3); + } +} + +#ifndef MNN_USE_NEON +void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow) { + int unit = ow / 2; + auto w00 = Vec4::load(weigth + 0 * 16 + 4 * 0); + auto w01 = Vec4::load(weigth + 0 * 16 + 4 * 1); + auto w02 = Vec4::load(weigth + 0 * 16 + 4 * 2); + auto w03 = Vec4::load(weigth + 0 * 16 + 4 * 3); + auto w10 = Vec4::load(weigth + 1 * 16 + 4 * 0); + auto w11 = Vec4::load(weigth + 1 * 16 + 4 * 1); + auto w12 = Vec4::load(weigth + 1 * 16 + 4 * 2); + auto w13 = Vec4::load(weigth + 1 * 16 + 4 * 3); + auto w20 = Vec4::load(weigth + 2 * 16 + 4 * 0); + auto w21 = Vec4::load(weigth + 2 * 16 + 4 * 1); + auto w22 = Vec4::load(weigth + 2 * 16 + 4 * 2); + auto w23 = Vec4::load(weigth + 2 * 16 + 4 * 3); + for (int x = 0; x < unit; ++x) { + auto offset = 4 * 4 * x; + int i = 0; + Vec4 m0 = w00 * Vec4::load(cacheLine[0] + offset + 4 * 0); + Vec4 m1 = w01 * Vec4::load(cacheLine[0] + offset + 4 * 1); + Vec4 m2 = w02 * Vec4::load(cacheLine[0] + offset + 4 * 2); + Vec4 m3 = w03 * Vec4::load(cacheLine[0] + offset + 4 * 3); + + m0 = m0 + w10 * Vec4::load(cacheLine[1] + offset + 4 * 0); + m1 = m1 + w11 * 
Vec4::load(cacheLine[1] + offset + 4 * 1); + m2 = m2 + w12 * Vec4::load(cacheLine[1] + offset + 4 * 2); + m3 = m3 + w13 * Vec4::load(cacheLine[1] + offset + 4 * 3); + + m0 = m0 + w20 * Vec4::load(cacheLine[2] + offset + 4 * 0); + m1 = m1 + w21 * Vec4::load(cacheLine[2] + offset + 4 * 1); + m2 = m2 + w22 * Vec4::load(cacheLine[2] + offset + 4 * 2); + m3 = m3 + w23 * Vec4::load(cacheLine[2] + offset + 4 * 3); + + auto o0 = m0 + m1 + m2; + auto o1 = m1 - m2 + m3; + Vec4::save(dest + 8 * x + 0 * 4, o0); + Vec4::save(dest + 8 * x + 1 * 4, o1); + } + if (unit * 2 < ow) { + auto offset = 4 * 4 * unit; + Vec4 m0 = w00 * Vec4::load(cacheLine[0] + offset + 4 * 0); + Vec4 m1 = w01 * Vec4::load(cacheLine[0] + offset + 4 * 1); + Vec4 m2 = w02 * Vec4::load(cacheLine[0] + offset + 4 * 2); + + m0 = m0 + w10 * Vec4::load(cacheLine[1] + offset + 4 * 0); + m1 = m1 + w11 * Vec4::load(cacheLine[1] + offset + 4 * 1); + m2 = m2 + w12 * Vec4::load(cacheLine[1] + offset + 4 * 2); + + m0 = m0 + w20 * Vec4::load(cacheLine[2] + offset + 4 * 0); + m1 = m1 + w21 * Vec4::load(cacheLine[2] + offset + 4 * 1); + m2 = m2 + w22 * Vec4::load(cacheLine[2] + offset + 4 * 2); + auto o0 = m0 + m1 + m2; + Vec4::save(dest + 8 * unit + 0 * 4, o0); + } +} +void MNNConvDwF23SourceTransUnit(const float *source, float *dest, size_t unit) { + if (unit <= 0) { + return; + } + Vec4 v0 = Vec4::load(source + 4 * 0); + Vec4 v1 = Vec4::load(source + 4 * 1); + Vec4 v2; + Vec4 v3; + source += 8; + + for (int x = 0; x < unit; ++x) { + v2 = Vec4::load(source + 0 * 4); + v3 = Vec4::load(source + 1 * 4); + auto m0 = v0 - v2; + auto m1 = v1 + v2; + auto m2 = v2 - v1; + auto m3 = v3 - v1; + + Vec4::save(dest + 4 * 0, m0); + Vec4::save(dest + 4 * 1, m1); + Vec4::save(dest + 4 * 2, m2); + Vec4::save(dest + 4 * 3, m3); + + source += 8; + dest += 16; + + v0 = v2; + v1 = v3; + } +} +#endif + +namespace MNN { + +static CoreFunctions* gCoreFunction = nullptr; + +void MNNCoreFunctionInit() { + gCoreFunction = new CoreFunctions; + // MatMul + gCoreFunction->MNNGetMatMulPackMode = MNNGetMatMulPackMode; + gCoreFunction->MNNPackC4ForMatMul_A = MNNPackC4ForMatMul_A; + gCoreFunction->MNNPackForMatMul_B = MNNPackForMatMul_B; + gCoreFunction->MNNPackedMatMul = MNNPackedMatMul; + gCoreFunction->MNNPackedMatMulRemain = MNNPackedMatMulRemain; + + // Lowp + gCoreFunction->MNNFp32ToLowp = nullptr; + gCoreFunction->MNNLowpToFp32 = nullptr; + gCoreFunction->bytes = 4;// sizeof(float) + + // Packed Function + gCoreFunction->pack = 4; + gCoreFunction->MNNPackCUnit = MNNPackC4; + gCoreFunction->MNNUnpackCUnit = MNNUnpackC4; + + // FIXME: MNNPackTranspose and MNNUnpackTranspose is reverted + gCoreFunction->MNNUnpackCUnitTranspose = MNNPackTranspose; + gCoreFunction->MNNPackCUnitTranspose = MNNUnpackTranspose; + gCoreFunction->MNNAxByClampBroadcastUnit = MNNAxByClampBroadcastUnit; + gCoreFunction->MNNConvRunForLineDepthwise = MNNConvRunForLineDepthwise; + gCoreFunction->MNNConvRunForUnitDepthWise = MNNConvRunForUnitDepthWise; + gCoreFunction->MNNSourceTransformCommonF23 = MNNSourceTransformCommonF23; + gCoreFunction->MNNConvDwF23MulTransUnit = MNNConvDwF23MulTransUnit; + gCoreFunction->MNNMultiAndDestTransformCommon23 = MNNMultiAndDestTransformCommon23; + gCoreFunction->MNNMatrixAdd = MNNMatrixAdd; + gCoreFunction->MNNMatrixSub = MNNMatrixSub; + gCoreFunction->MNNStrassenMergeCFunction = MNNStrassenMergeCFunction; + gCoreFunction->penalty = 1.5f; + gCoreFunction->MNNScaleAndAddBias = MNNScaleAndAddBias; + gCoreFunction->MNNAddC4WithStride = MNNAddC4WithStride; + 
gCoreFunction->MNNCopyC4WithStride = MNNCopyC4WithStride; + + gCoreFunction->chooseWinoSourceTransform = WinogradFunction::chooseSourceTransform; + gCoreFunction->chooseWinoDestTransform = WinogradFunction::chooseDestTransform; + gCoreFunction->MNNDeconvRunForLineDepthwise = MNNDeconvRunForLineDepthwise; + gCoreFunction->MNNDeconvRunForUnitDepthWise = MNNDeconvRunForUnitDepthWise; + MNNFunctionInit(); +} +CoreFunctions* MNNGetCoreFunctions() { + return gCoreFunction; +} +}; diff --git a/source/backend/cpu/compute/CommonOptFunction.h b/source/backend/cpu/compute/CommonOptFunction.h index 942c5d01..afefb6c8 100644 --- a/source/backend/cpu/compute/CommonOptFunction.h +++ b/source/backend/cpu/compute/CommonOptFunction.h @@ -15,13 +15,7 @@ #include "core/Macro.h" -#ifdef __cplusplus extern "C" { -#endif - -void MNNAddBias(float* dst, const float* bias, size_t planeNumber, size_t biasNumber); -void MNNAddBiasRelu(float* dst, const float* bias, size_t planeNumber, size_t biasNumber); -void MNNAddBiasRelu6(float* dst, const float* bias, size_t planeNumber, size_t biasNumber); void MNNReluWithSlope(float* dst, const float* src, size_t sizeQuad, float slope); @@ -31,29 +25,32 @@ void MNNReluInt8(int8_t* dst, const int8_t* src, size_t size); void MNNReluWithSlopeChannel(float* dst, const float* src, const float* slope, size_t sizeQuad, size_t depthQuad); +void MNNHardSwish(float* dst, const float* src, size_t size); + void MNNPackC4(float* dst, const float* src, size_t area, size_t depth); +void MNNPackC4Int16(int16_t* dst, const int16_t* src, size_t area, size_t depth); + void MNNPackC4Uint8(uint8_t* dst, const uint8_t* src, size_t area, size_t depth); void MNNUnpackC4(float* dst, const float* src, size_t area, size_t depth); +void MNNUnpackC4Int16(int16_t* dst, const int16_t* src, size_t area, size_t depth); + void MNNUnpackC4Uint8(uint8_t* dst, const uint8_t* src, size_t area, size_t depth); void MNNScaleAndAddBias(float* dst, const float* src, const float* bias, const float* alpha, size_t planeNumber, size_t biasNumber); void MNNScaleAndAddBiasScalar(float* dst, const float* src, float bias, float alpha, size_t number); -void MNNScaleAndAddBiasOutside(float* dst, const float* src, const float* bias, const float* alpha, size_t planeNumber, - size_t biasNumber); - void MNNUnpackTranspose(float* dst, const float* src, size_t area, size_t depth); +void MNNUnpackTransposeInt16(int16_t* dst, const int16_t* src, size_t area, size_t depth); void MNNUnpackTransposeUint8(uint8_t* dst, const uint8_t* src, size_t area, size_t depth); void MNNPackTranspose(float* dst, const float* src, size_t area, size_t depth); +void MNNPackTransposeInt16(int16_t* dst, const int16_t* src, size_t area, size_t depth); void MNNPackTransposeUint8(uint8_t* dst, const uint8_t* src, size_t area, size_t depth); -void MNNUnpackC4(float* dst, const float* src, size_t area, size_t depth); - void MNNCopyC4WithStride(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count); void MNNAddC4WithStride(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count); @@ -67,26 +64,44 @@ void MNNExpC8(float* dest, const float* source, const float* parameters, size_t void MNNPowC8(float* dest, const float* source, const float* powfParam, size_t betaInt, size_t countC8); void MNNExp(float* dst, const float* src, size_t dataSize); +void MNNSin(float* dst, const float* src, size_t dataSize); void MNNTanh(float* dst, const float* src, size_t dataSize); +void MNNSigmoid(float* dst, const float* src, 
size_t dataSize); +void MNNSigmoidLowp(float* dst, const float* src, size_t dataSize); void MNNReluWithSlopeCommon(float* dst, const float* src, size_t size, float slope); -bool MNNReorder4x4ByPlatform(float* dst, size_t size); +void MNNHardSwishCommon(float* dst, const float* src, size_t size); // Get Pack for MatMul's e , l , h , the pack number must be 1 or 4 * n void MNNGetMatMulPackMode(int* eP, int *lP, int* hP); -void MNNPackC4ForMatMul_A(float* dest, const float* source, size_t e, size_t l, size_t eReal); + + +/** + int number = info[0]; + int eSrcStride = info[1]; + int eDstStride = info[2]; + int xStride = info[3]; + +el: number * 4 + 0: e + 1: l + 2: e-offset + 3: l-offset + */ +void MNNPackC4ForMatMul_A(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el); + void MNNPackForMatMul_B(float* dest, const float* source, size_t h, size_t l, bool transpose); // parameters: e, l, h, CStride, AStride, BStride -void MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter, float* cache, const float* postParameters, const float* bias); +void MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias); void MNNFunctionInit(); -void MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, float* cache, const float* postParameters, const float* bias); +void MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias); int MNNGetC4DivNumber(int hP); // C = clamp(alpha * A + beta * B, min, max) // paramters: alpha, beta, min, max void MNNAxByClamp(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t bStride, size_t height, const float* parameters); -void MNNAxByClampBroadcastC4(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters); +void MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters); // dim: 4-element, sizeDW, sizeDH, strideSW, strideDH void MNNTranspose32Bit(int32_t* dstO, const int32_t* srcO, int32_t* dim); // not C4 @@ -103,11 +118,72 @@ struct MatMulParam { }; void MNNComputeMatMulForE_1(const float* A, const float* B, float* C, const float* biasPtr, const MatMulParam* param, size_t tId); +void MNNCopyC4Int16WithStride(const float* sourceF, float* destF, size_t srcStride, size_t dstStride, size_t count); +void MNNSourceTransformCommonF23(const float *source, float *dest, int unit, int iw, int pad, int su, int eu); +void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow); +void MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow); #ifdef MNN_USE_SSE void MNNInt8ToInt16(int16_t* dest, const int8_t* source, size_t count); #endif -#ifdef __cplusplus } -#endif + +// c++ template function should not in extern C +template +void MNNPackForMatMul_B_Template(DataType* dest, const DataType* source, size_t h, size_t l, bool transpose); + +namespace MNN { +struct CoreFunctions { + /**MatMul Pack and Functions*/ + void(*MNNGetMatMulPackMode)(int* eP, int *lP, int* hP); + void(*MNNPackC4ForMatMul_A)(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el); + void(*MNNPackForMatMul_B)(float* dest, 
const float* source, size_t h, size_t l, bool transpose); + // parameters: e, l, h, CStride, AStride, BStride + void(*MNNPackedMatMul)(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias); + void(*MNNPackedMatMulRemain)(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias); + + /**Lowp Backend Setting*/ + void(*MNNFp32ToLowp)(const float* src, int16_t* dst, size_t size); + void(*MNNLowpToFp32)(const int16_t* src, float* dst, size_t size); + int bytes; + + /**NC4HW4's Functions*/ + int pack; + void(*MNNPackCUnit)(float* dst, const float* src, size_t area, size_t depth); + void(*MNNUnpackCUnit)(float* dst, const float* src, size_t area, size_t depth); + void(*MNNPackCUnitTranspose)(float* dst, const float* src, size_t area, size_t depth); + void(*MNNUnpackCUnitTranspose)(float* dst, const float* src, size_t area, size_t depth); + + void(*MNNConvRunForUnitDepthWise)(float* dst, const float* src, const float* weight, size_t fw, size_t fh, + size_t weight_y_step, size_t dilateX_step, size_t dilateY_step); + void(*MNNConvRunForLineDepthwise)(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, + size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, + size_t srcHStep, size_t dstHStep); + void(*MNNAxByClampBroadcastUnit)(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters); + void(*MNNMultiAndDestTransformCommon23)(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow); + void(*MNNSourceTransformCommonF23)(const float *source, float *dest, int unit, int iw, int pad, int su, int eu); + void(*MNNConvDwF23MulTransUnit)(float **cacheLine, const float *weigth, float *dest, size_t ow); + void(*MNNMatrixAdd)(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride, + size_t bStride, size_t height); + void(*MNNMatrixSub)(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride, + size_t bStride, size_t height); + void(*MNNStrassenMergeCFunction)(float* c11, float* c12, float* c21, float* c22, float* xAddr, size_t cStride, size_t eSub, size_t hSub); + void(*MNNScaleAndAddBias)(float* dst, const float* src, const float* bias, const float* alpha, size_t planeNumber, size_t biasNumber); + float penalty; + + void(*MNNCopyC4WithStride)(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count); + void(*MNNAddC4WithStride)(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count); + + typedef void (*WinoTransFunc)(const float* srcBlock, float* dstStart, size_t srcStep, size_t dstStep); + WinoTransFunc(*chooseWinoSourceTransform)(int k, int w); + WinoTransFunc(*chooseWinoDestTransform)(int k, int h); + + void(*MNNDeconvRunForUnitDepthWise)(const float* dst, float* src, const float* weight, size_t fw, size_t fh, + size_t weight_y_step, size_t dilateX_step, size_t dilateY_step); + void(*MNNDeconvRunForLineDepthwise)(const float* dst, float* src, const float* weight, size_t width, size_t src_w_setup, + size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step); +}; +void MNNCoreFunctionInit(); +CoreFunctions* MNNGetCoreFunctions(); +}; #endif /* CommonOptFunction_h */ diff --git a/source/backend/cpu/compute/ConvInt83x3.cpp b/source/backend/cpu/compute/ConvInt83x3.cpp index 122653df..380727f4 100644 --- 
a/source/backend/cpu/compute/ConvInt83x3.cpp +++ b/source/backend/cpu/compute/ConvInt83x3.cpp @@ -2,6 +2,7 @@ #include "backend/cpu/CPUBackend.hpp" #include "core/Macro.h" #include "core/Concurrency.h" +#include "core/TensorUtils.hpp" #include "ConvOpt.h" #include "backend/cpu/compute/ConvOpt.h" #include "Int8FunctionsOpt.h" @@ -245,15 +246,37 @@ ConvInt83x3::ConvInt83x3(Backend *backend, const MNN::Convolution2D *convParam, // mWeightInt8 is used to store untransformed reordered weight mWeightInt8.reset(Tensor::createDevice({UP_DIV(outputCount, 4), UP_DIV(srcCount, unitI), 9, unitI * 4})); - bool allocRes = backend->onAcquireBuffer(mWeightInt8.get(), Backend::STATIC); + bool res = backend->onAcquireBuffer(mWeightInt8.get(), Backend::STATIC); + if (!res) { + return; + } + const int outputChannleUp4 = ALIGN_UP4(outputCount); + mBiasFloat.reset(Tensor::createDevice({outputChannleUp4})); + res = backend->onAcquireBuffer(mBiasFloat.get(), Backend::STATIC); + if (!res) { + mValid = false; + return; + } + mScaleFloat.reset(Tensor::createDevice({outputChannleUp4})); + res = backend->onAcquireBuffer(mScaleFloat.get(), Backend::STATIC); + if (!res) { + mValid = false; + return; + } + auto biasPtr = mBiasFloat->host(); + memset(biasPtr, 0, outputChannleUp4 * sizeof(int32_t)); + auto scalePtr = mScaleFloat->host(); + memset(scalePtr, 0, outputChannleUp4 * sizeof(float)); const int8_t *weightSrc = nullptr; std::shared_ptr quanCommon; - if (convParam->quanParameter() != nullptr) { - quanCommon = ConvolutionCommon::load(convParam->quanParameter(), false); - weightSrc = quanCommon->weight.get(); - } else { - weightSrc = convParam->symmetricQuan()->weight()->data(); + float inputScale = TensorUtils::getDescribe(inputs[0])->quantAttr ? + TensorUtils::getDescribe(inputs[0])->quantAttr->scale : 0.f; + float outputScale = TensorUtils::getDescribe(outputs[0])->quantAttr ? 
+ TensorUtils::getDescribe(outputs[0])->quantAttr->scale : 0.f; + if (!ConvolutionCommon::getConvInt8Parameters(convParam, quanCommon, weightSrc, scalePtr, biasPtr, inputScale, outputScale)) { + return; } + auto weightDst = mWeightInt8->host(); CPUConvolution::reorderWeightSlow(weightDst, weightSrc, srcCount, outputCount, 9, unitI, 4, true); // mWeight is used to store 2d-transformed weight @@ -265,23 +288,6 @@ ConvInt83x3::ConvInt83x3(Backend *backend, const MNN::Convolution2D *convParam, return; } } - - const int outputChannleUp4 = ALIGN_UP4(outputCount); - mBiasFloat.reset(Tensor::createDevice({outputChannleUp4})); - auto biasOriginPtr = convParam->symmetricQuan()->bias()->data(); - allocRes = CPUConvolution::acquireMemoryAndCopy(mBiasFloat, biasOriginPtr, outputCount, backend); - if (!allocRes) { - mValid = false; - return; - } - - mScaleFloat.reset(Tensor::createDevice({outputChannleUp4})); - auto scaleOriginData = convParam->symmetricQuan()->scale()->data(); - allocRes = CPUConvolution::acquireMemoryAndCopy(mScaleFloat, scaleOriginData, outputCount, backend); - if (!allocRes) { - mValid = false; - return; - } mRelu = convCommon->relu() || convCommon->relu6(); } diff --git a/source/backend/cpu/compute/ConvInt8_1xN.cpp b/source/backend/cpu/compute/ConvInt8_1xN.cpp index 3bb4059f..628fa377 100644 --- a/source/backend/cpu/compute/ConvInt8_1xN.cpp +++ b/source/backend/cpu/compute/ConvInt8_1xN.cpp @@ -46,7 +46,7 @@ static void MNNTranspose8Bit(int8_t* dstO, const int8_t* srcO, int* dim, int uni namespace MNN { -ConvInt8_1xN::ConvInt8_1xN(Backend *backend, const MNN::Convolution2D *convParam) : CPUConvolution(convParam->common(), backend) { +ConvInt8_1xN::ConvInt8_1xN(Backend *backend, const MNN::Convolution2D *convParam, float inputScale, float outputScale) : CPUConvolution(convParam->common(), backend) { const auto convCommon = convParam->common(); const auto kx = convCommon->kernelX(), ky = convCommon->kernelY(); const auto outputCount = convCommon->outputCount(), srcCount = convCommon->inputCount(); @@ -67,14 +67,29 @@ ConvInt8_1xN::ConvInt8_1xN(Backend *backend, const MNN::Convolution2D *convParam mValid = false; return; } + const int outputChannleUp4 = ALIGN_UP4(outputCount); + mBiasFloat.reset(Tensor::createDevice({outputChannleUp4})); + res = backend->onAcquireBuffer(mBiasFloat.get(), Backend::STATIC); + if (!res) { + mValid = false; + return; + } + mScaleFloat.reset(Tensor::createDevice({outputChannleUp4})); + res = backend->onAcquireBuffer(mScaleFloat.get(), Backend::STATIC); + if (!res) { + mValid = false; + return; + } + auto biasPtr = mBiasFloat->host(); + memset(biasPtr, 0, outputChannleUp4 * sizeof(int32_t)); + auto scalePtr = mScaleFloat->host(); + memset(scalePtr, 0, outputChannleUp4 * sizeof(float)); const int8_t *weightSrc = nullptr; std::shared_ptr quanCommon; - if (convParam->quanParameter() != nullptr) { - quanCommon = ConvolutionCommon::load(convParam->quanParameter(), false); - weightSrc = quanCommon->weight.get(); - } else { - weightSrc = convParam->symmetricQuan()->weight()->data(); + if (!ConvolutionCommon::getConvInt8Parameters(convParam, quanCommon, weightSrc, scalePtr, biasPtr, inputScale, outputScale)) { + return; } + auto weightDst = weightInt8->host(); memset(weightDst, 0, weightInt8->size()); CPUConvolution::reorderWeightSlow(weightDst, weightSrc, srcCount, outputCount, mKernelSize, unitI, 4, true); @@ -98,19 +113,6 @@ ConvInt8_1xN::ConvInt8_1xN(Backend *backend, const MNN::Convolution2D *convParam backend->onReleaseBuffer(weightInt8.get(), 
Backend::STATIC); - const int outputChannleUp4 = ALIGN_UP4(outputCount); - mBiasFloat.reset(Tensor::createDevice({outputChannleUp4})); - auto biasOriginPtr = convParam->symmetricQuan()->bias()->data(); - res = res && CPUConvolution::acquireMemoryAndCopy(mBiasFloat, biasOriginPtr, outputCount, backend); - - mScaleFloat.reset(Tensor::createDevice({outputChannleUp4})); - auto scaleOriginData = convParam->symmetricQuan()->scale()->data(); - res = res && CPUConvolution::acquireMemoryAndCopy(mScaleFloat, scaleOriginData, outputCount, backend); - if (!res) { - mValid = false; - return; - } - mRelu = convCommon->relu() || convCommon->relu6(); } diff --git a/source/backend/cpu/compute/ConvInt8_1xN.hpp b/source/backend/cpu/compute/ConvInt8_1xN.hpp index 6b3f7bcc..33585e58 100644 --- a/source/backend/cpu/compute/ConvInt8_1xN.hpp +++ b/source/backend/cpu/compute/ConvInt8_1xN.hpp @@ -14,7 +14,7 @@ namespace MNN { class ConvInt8_1xN : public CPUConvolution { public: - ConvInt8_1xN(Backend *backend, const MNN::Convolution2D *convOp); + ConvInt8_1xN(Backend *backend, const MNN::Convolution2D *convOp, float inputScale, float outputScale); virtual ~ConvInt8_1xN(); virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; diff --git a/source/backend/cpu/compute/ConvOpt.cpp b/source/backend/cpu/compute/ConvOpt.cpp index b03274d6..5f7545c2 100644 --- a/source/backend/cpu/compute/ConvOpt.cpp +++ b/source/backend/cpu/compute/ConvOpt.cpp @@ -13,7 +13,6 @@ #include "math/Vec.hpp" using Vec4 = MNN::Math::Vec; #ifndef MNN_USE_NEON -#ifndef MNN_USE_SSE void MNNMatrixSub(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride, size_t bStride, size_t height) { @@ -22,9 +21,7 @@ void MNNMatrixSub(float* C, const float* A, const float* B, size_t widthC4, size auto b = B + bStride * y; auto c = C + cStride * y; for (int x = 0; x < widthC4; ++x) { - for (int j = 0; j < 4; ++j) { - c[4 * x + j] = a[4 * x + j] - b[4 * x + j]; - } + Vec4::save(c + 4 * x, Vec4::load(a + 4 * x) - Vec4::load(b + 4 * x)); } } } @@ -35,45 +32,10 @@ void MNNMatrixAdd(float* C, const float* A, const float* B, size_t widthC4, size auto b = B + bStride * y; auto c = C + cStride * y; for (int x = 0; x < widthC4; ++x) { - for (int j = 0; j < 4; ++j) { - c[4 * x + j] = a[4 * x + j] + b[4 * x + j]; - } + Vec4::save(c + 4 * x, Vec4::load(a + 4 * x) + Vec4::load(b + 4 * x)); } } } -void MNNGemmFloatCommon_4(float* dst, const float* src, const float* weight, size_t src_depth_quad, size_t dst_step, - size_t dst_depth_quad, size_t width, size_t weight_depth_offset) { - int dx, sz, dz; - auto src_depth_step = 4 * width; - for (dz = 0; dz < dst_depth_quad; ++dz) { - float* dst_z = dst + dz * dst_step; - auto weight_dz = weight + dz * (src_depth_quad * 16 + weight_depth_offset); - for (dx = 0; dx < width; ++dx) { - float* dst_x = dst_z + dx * 4; - dst_x[0] = 0.0f; - dst_x[1] = 0.0f; - dst_x[2] = 0.0f; - dst_x[3] = 0.0f; - const float* src_dx = src + 4 * dx; - for (sz = 0; sz < src_depth_quad; ++sz) { - const float* src_z = src_dx + sz * src_depth_step; - const float* weight_z = weight_dz + sz * 16; - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < 4; ++j) { - dst_x[j] += src_z[i] * weight_z[4 * i + j]; - } - } - } - } - } -} - -void MNNGemmFloatUnit_4(float* dstOrigin, const float* src, const float* weight, size_t src_depth_quad, size_t dst_step, - size_t dst_depth_quad, size_t weight_depth_offset) { - 
auto CONVOLUTION_TILED_NUMBER = MNNGetConvolutionTileNumber(); - MNNGemmFloatCommon_4(dstOrigin, src, weight, src_depth_quad, dst_step, dst_depth_quad, CONVOLUTION_TILED_NUMBER, - weight_depth_offset); -} void MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, @@ -100,7 +62,6 @@ void MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weigh } } } -#endif void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) { @@ -183,11 +144,6 @@ void MNNConvRunForLineint8_t(float* dst, const int8_t* src, const int8_t* weight } } -void MNNGemmFloatOne_4(float* dstOrigin, const float* src, const float* weight, size_t src_depth_quad, size_t dst_step, - size_t dst_depth_quad, size_t weight_depth_offset) { - MNNGemmFloatCommon_4(dstOrigin, src, weight, src_depth_quad, dst_step, dst_depth_quad, 1, weight_depth_offset); -} - void MNNDeconvRunForUnitDepthWise(const float* dst, float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) { int fx, fy; @@ -325,8 +281,30 @@ void MNNMatrixMaxCommon(float* C, const float* A, const float* B, size_t width, } } } -#ifndef MNN_USE_SSE -int MNNGetConvolutionTileNumber() { - return 8; +#ifndef MNN_USE_NEON +void MNNStrassenMergeCFunction(float* c11, float* c12, float* c21, float* c22, float* xAddr, size_t cStride, + size_t eSub, size_t hSub) { + for (int y=0; y #include "core/BufferAllocator.hpp" #include "backend/cpu/CPUBackend.hpp" -#include "CommonOptFunction.h" #include "core/Concurrency.h" #include "ConvOpt.h" #include "core/Macro.h" +#include "CommonOptFunction.h" + namespace MNN { Convolution1x1Strassen::Convolution1x1Strassen(const Convolution2DCommon *common, Backend *b, const float *originWeight, size_t originWeightSize, const float *bias, size_t biasSize) : CPUConvolution(common, b) { auto outputCount = (int)biasSize; auto mSrcCount = (int)originWeightSize / outputCount; - int ePack, lPack, hPack; - MNNGetMatMulPackMode(&ePack, &lPack, &hPack); mResource.reset(new CPUConvolution::Resource); mResource->backend = b; - mResource->mWeight.reset(Tensor::createDevice(std::vector{UP_DIV(outputCount, hPack), mSrcCount, hPack})); + if (!mResource->copyBiasAlign(bias, biasSize)) { + MNN_ERROR("Not Enough Memory\n"); + mValid = false; + return; + } + auto core = static_cast(b)->functions(); + int ePack, lPack, hPack; + core->MNNGetMatMulPackMode(&ePack, &lPack, &hPack); + mResource->mWeight.reset(Tensor::createDevice(std::vector{UP_DIV(outputCount, hPack), UP_DIV(mSrcCount, lPack) * lPack, hPack})); mValid = b->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC); if (!mValid) { MNN_ERROR("Not Enough Memory\n"); return; } - MNNPackForMatMul_B(mResource->mWeight->host(), originWeight, outputCount, mSrcCount, true); - mResource->mBias.reset(Tensor::createDevice(std::vector{UP_DIV((int)biasSize, 4), 4})); - if (!(backend()->onAcquireBuffer(mResource->mBias.get(), Backend::STATIC))) { - MNN_ERROR("Not Enough Memory\n"); - mValid = false; - return; - } - ::memcpy(mResource->mBias->host(), bias, biasSize * sizeof(float)); - auto remain = mResource->mBias->size() - biasSize * sizeof(float); - if (remain > 0) { - ::memset(mResource->mBias->host() + biasSize, 0, remain); + if (core->bytes < 4) { + AutoRelease 
tempTensor(Tensor::createDevice({outputCount * mSrcCount})); + mValid = b->onAcquireBuffer(tempTensor.get(), Backend::STATIC); + if (!mValid) { + MNN_ERROR("Not Enough Memory\n"); + return; + } + core->MNNFp32ToLowp(originWeight, tempTensor->host(), outputCount * mSrcCount); + core->MNNPackForMatMul_B(mResource->mWeight->host(), tempTensor->host(), outputCount, mSrcCount, true); + b->onReleaseBuffer(tempTensor.get(), Backend::STATIC); + } else { + core->MNNPackForMatMul_B(mResource->mWeight->host(), originWeight, outputCount, mSrcCount, true); } } Convolution1x1Strassen::Convolution1x1Strassen(std::shared_ptr resource, const Convolution2DCommon *common, Backend* b) : CPUConvolution(common, b) { @@ -64,22 +72,24 @@ bool Convolution1x1Strassen::onClone(Backend* bn, const Op* op, Execution** dst) ErrorCode Convolution1x1Strassen::onResize(const std::vector &inputs, const std::vector &outputs) { CPUConvolution::onResize(inputs, outputs); + auto core = static_cast(backend())->functions(); int ePack, lPack, hPack; - MNNGetMatMulPackMode(&ePack, &lPack, &hPack); - + core->MNNGetMatMulPackMode(&ePack, &lPack, &hPack); + int bytes = core->bytes; auto CONVOLUTION_TILED_NUMBER = ePack; auto input = inputs[0]; auto output = outputs[0]; int numberThread = ((CPUBackend *)backend())->threadNumber(); auto ic = input->channel(); - auto icC4 = UP_DIV(ic, 4); - auto ocC4 = UP_DIV(output->channel(), 4); + auto oc = output->channel(); + auto icC4 = UP_DIV(ic, core->pack); + auto ocC4 = UP_DIV(oc, core->pack); auto batch = input->batch(); auto matrixSizeE = output->height() * output->width() * input->batch(); auto outputPlane = output->height() * output->width(); mUnits.clear(); - auto inputPtr = input->host(); - auto outputPtr = output->host(); + auto inputPtr = input->host(); + auto outputPtr = output->host(); mTempOutputBatch.reset(); mTempInputBatch.reset(); std::shared_ptr __autoFunction; @@ -90,15 +100,15 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector &inputs, mNeedPretreat = input->batch() > 1 || (!(padX == 0 && padY == 0 && strideY == 1 && strideX == 1)); auto postParameters = getPostParameters(); if (mNeedPretreat) { - mTempInputBatch.reset(Tensor::createDevice(std::vector{icC4, matrixSizeE, 4})); - mTempOutputBatch.reset(Tensor::createDevice(std::vector{ocC4, matrixSizeE, 4})); + mTempInputBatch.reset(Tensor::createDevice(std::vector{icC4, matrixSizeE, core->pack})); + mTempOutputBatch.reset(Tensor::createDevice(std::vector{ocC4, matrixSizeE, core->pack})); bool success = backend()->onAcquireBuffer(mTempOutputBatch.get(), Backend::DYNAMIC); success = success && backend()->onAcquireBuffer(mTempInputBatch.get(), Backend::DYNAMIC); if (!success) { return OUT_OF_MEMORY; } - inputPtr = mTempInputBatch->host(); - outputPtr = mTempOutputBatch->host(); + inputPtr = mTempInputBatch->host(); + outputPtr = mTempOutputBatch->host(); __autoFunction = std::shared_ptr(nullptr, [this](void *ptr) { backend()->onReleaseBuffer(mTempOutputBatch.get(), Backend::DYNAMIC); backend()->onReleaseBuffer(mTempInputBatch.get(), Backend::DYNAMIC); @@ -108,32 +118,33 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector &inputs, auto iw = input->width(); auto ih = input->height(); if (padX == 0 && padY == 0 && strideY == 1 && strideX == 1) { - mPretreatFunction = [outputPlane, icC4, batch, numberThread, this](const float *srcBatch, float *dstBatch) { + mPretreatFunction = [outputPlane, icC4, batch, numberThread, this, core](const uint8_t *srcBatch, uint8_t *dstBatch) { MNN_CONCURRENCY_BEGIN(y, icC4) { - 
auto srcY = srcBatch + outputPlane * y * 4; - auto dstY = dstBatch + y * outputPlane * batch * 4; + auto srcY = srcBatch + outputPlane * y * core->pack * core->bytes; + auto dstY = dstBatch + y * outputPlane * batch * core->pack * core->bytes; for (int x = 0; x < batch; ++x) { - auto srcX = srcY + x * outputPlane * icC4 * 4; - auto dstX = dstY + x * outputPlane * 4; - ::memcpy(dstX, srcX, outputPlane * 4 * sizeof(float)); + auto srcX = srcY + x * outputPlane * icC4 * core->pack * core->bytes; + auto dstX = dstY + x * outputPlane * core->pack * core->bytes; + ::memcpy(dstX, srcX, outputPlane * core->pack * core->bytes); } } MNN_CONCURRENCY_END(); }; } else if (strideY == 1 && strideX == 1) { - mPretreatFunction = [outputPlane, padY, padX, ow, oh, iw, ih, icC4, batch, this](const float *srcOrigin, - float *dstOrigin) { - ::memset(dstOrigin, 0, outputPlane * batch * sizeof(float) * 4 * icC4); + mPretreatFunction = [outputPlane, padY, padX, ow, oh, iw, ih, icC4, batch, this, core](const uint8_t *srcOrigin, + uint8_t *dstOrigin) { + auto unitBytes = core->bytes * core->pack; + ::memset(dstOrigin, 0, outputPlane * batch * unitBytes * icC4); MNN_CONCURRENCY_BEGIN(z, icC4) { - auto srcZ = srcOrigin + z * iw * ih * 4; - auto dstZ = dstOrigin + z * ow * oh * batch * 4; + auto srcZ = srcOrigin + z * iw * ih * unitBytes; + auto dstZ = dstOrigin + z * ow * oh * batch * unitBytes; for (int b = 0; b < batch; ++b) { - auto srcBatch = srcZ + b * iw * ih * icC4 * 4; - auto dstBatch = dstZ + b * ow * oh * 4; + auto srcBatch = srcZ + b * iw * ih * icC4 * unitBytes; + auto dstBatch = dstZ + b * ow * oh * unitBytes; for (int y = 0; y < ih; ++y) { - auto src = srcBatch + iw * y * 4; - auto dst = dstBatch + (ow * (y + padY) + padX) * 4; - ::memcpy(dst, src, iw * 4 * sizeof(float)); + auto src = srcBatch + iw * y * unitBytes; + auto dst = dstBatch + (ow * (y + padY) + padX) * unitBytes; + ::memcpy(dst, src, iw * unitBytes); } } } @@ -156,22 +167,22 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector &inputs, int oyCount = oyEnd - oyStart + 1; int oxCount = oxEnd - oxStart + 1; mPretreatFunction = [outputPlane, padY, padX, strideX, strideY, ow, oh, iw, ih, icC4, oxStart, oyStart, - oxCount, oyCount, batch, this](const float *srcOrigin, float *dstOrigin) { - ::memset(dstOrigin, 0, outputPlane * batch * sizeof(float) * 4 * icC4); - auto srcStride = strideX * 4; - auto dstStride = 4; + oxCount, oyCount, batch, this, core](const uint8_t *srcOrigin, uint8_t *dstOrigin) { + ::memset(dstOrigin, 0, outputPlane * batch * core->bytes * core->pack * icC4); + auto srcStride = strideX; + auto dstStride = 1; int syStart = oyStart * strideY - padY; int sxStart = oxStart * strideX - padX; MNN_CONCURRENCY_BEGIN(z, icC4) { - auto srcZ = srcOrigin + (z * iw * ih + syStart * iw + sxStart) * 4; - auto dstZ = dstOrigin + (z * ow * oh * batch + oyStart * ow + oxStart) * 4; + auto srcZ = srcOrigin + (z * iw * ih + syStart * iw + sxStart) * core->bytes * core->pack; + auto dstZ = dstOrigin + (z * ow * oh * batch + oyStart * ow + oxStart) * core->bytes * core->pack; for (int b = 0; b < batch; ++b) { - auto srcBatch = srcZ + b * iw * ih * icC4 * 4; - auto dstBatch = dstZ + b * ow * oh * 4; + auto srcBatch = srcZ + b * iw * ih * icC4 * core->bytes * core->pack; + auto dstBatch = dstZ + b * ow * oh * core->bytes * core->pack; for (int y = 0; y < oyCount; ++y) { - auto dstY = dstBatch + y * ow * 4; - auto srcY = srcBatch + y * strideY * iw * 4; - MNNCopyC4WithStride(srcY, dstY, srcStride, dstStride, oxCount); + auto dstY = dstBatch 
+ y * ow * core->bytes * core->pack; + auto srcY = srcBatch + y * strideY * iw * core->bytes * core->pack; + core->MNNCopyC4WithStride((const float*)(srcY), (float*)(dstY), strideX * core->pack, core->pack, oxCount); } } } @@ -183,6 +194,13 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector &inputs, memoryPool->barrierBegin(); std::shared_ptr __a(nullptr, [memoryPool](void *) { memoryPool->barrierEnd(); }); int maxDepth = 5; + auto icAlign = UP_DIV(ic, lPack) * lPack; + auto weightTensor = mResource->mWeight.get(); + AutoRelease tempWeight; + if (icAlign != ic) { + tempWeight.reset(Tensor::create(std::vector{oc, ic, hPack}, mResource->mWeight->host())); + weightTensor = tempWeight.get(); + } if (matrixSizeE > CONVOLUTION_TILED_NUMBER * 8 * numberThread && matrixSizeE > ocC4) { // Divide in plane, in this case the divide equal numberThread int divideStep = UP_DIV(matrixSizeE, numberThread); @@ -197,25 +215,26 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector &inputs, continue; } unit.mStracssenComputor.reset(new StrassenMatrixComputor(backend(), false, maxDepth)); - unit.mTempInput.reset( - Tensor::create(std::vector{icC4, planeSize, 4}, inputPtr + 4 * planeStart)); - unit.mTempInput->setStride(0, matrixSizeE * 4); - unit.mTempOutput.reset( - Tensor::create(std::vector{ocC4, planeSize, 4}, outputPtr + 4 * planeStart)); - unit.mTempOutput->setStride(0, matrixSizeE * 4); - unit.mTempInputVector = std::vector{unit.mTempInput.get(), mResource->mWeight.get(), mResource->mBias.get()}; - unit.mTempOutputVector = std::vector{unit.mTempOutput.get()}; + AutoRelease mTempInput( + Tensor::create(std::vector{icC4, planeSize, core->pack}, inputPtr + core->pack * planeStart * bytes)); + mTempInput->setStride(0, matrixSizeE * core->pack); + AutoRelease mTempOutput( + Tensor::create(std::vector{ocC4, planeSize, core->pack}, outputPtr + core->pack * planeStart * bytes)); + mTempOutput->setStride(0, matrixSizeE * core->pack); + unit.mTempInputVector = std::vector{mTempInput.get(), weightTensor, mResource->mBias.get()}; + unit.mTempOutputVector = std::vector{mTempOutput.get()}; memoryPool->beginGroup(); - std::shared_ptr __b(nullptr, [memoryPool](void *) { memoryPool->endGroup(); }); unit.mStracssenComputor->onReset(); auto code = unit.mStracssenComputor->onEncode(unit.mTempInputVector, unit.mTempOutputVector, postParameters); if (NO_ERROR != code) { + memoryPool->endGroup(); return code; } + memoryPool->endGroup(); } } else { // Divide in ocC4 - auto hDiv = MNNGetC4DivNumber(hPack); + auto hDiv = hPack / core->pack; auto ocDiv = UP_DIV(ocC4, hDiv); numberThread = std::min(numberThread, ocDiv); int divideStep = (ocDiv / numberThread) * hDiv; @@ -231,24 +250,25 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector &inputs, unit.mValid = false; continue; } - auto ocStartWeight = (ocStart * 4) / hPack; - auto ocWeightSize = std::min(UP_DIV((ocSize * 4), hPack), mResource->mWeight->length(0) - ocStartWeight); + auto ocStartWeight = (ocStart * core->pack) / hPack; + auto ocWeightSize = std::min(UP_DIV((ocSize * core->pack), hPack), mResource->mWeight->length(0) - ocStartWeight); unit.mStracssenComputor.reset(new StrassenMatrixComputor(backend(), false, maxDepth)); - unit.mTempInput.reset(Tensor::create(std::vector{icC4, matrixSizeE, 4}, inputPtr)); - unit.mTempBias.reset(Tensor::create({ocSize, 1, 4}, mResource->mBias->host() + 4 * ocStart)); - unit.mTempOutput.reset( - Tensor::create(std::vector{ocSize, matrixSizeE, 4}, outputPtr + 4 * matrixSizeE * ocStart)); - 
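To make the output-channel split in this hunk concrete, a worked example with made-up sizes (none of the numbers come from the patch): with core->pack = 4, core->bytes = 4 (fp32) and a hypothetical hPack = 8 from MNNGetMatMulPackMode, each packed weight block spans hDiv = hPack / pack = 2 output units, so a thread whose slice starts at packed unit ocStart = 6 reads weight block ocStartWeight = (6 * 4) / 8 = 3 and writes its output view pack * matrixSizeE * ocStart * bytes bytes into outputPtr:

    // Illustrative values only; hPack and lP really come from core->MNNGetMatMulPackMode.
    int pack = 4, bytes = 4, hPack = 8;
    int hDiv = hPack / pack;                          // = 2 packed output units per weight block
    int ocStart = 6;                                  // first packed output unit of this thread
    int ocStartWeight = (ocStart * pack) / hPack;     // = 3
    int icAlign = 64;                                 // UP_DIV(ic, lP) * lP, illustrative
    size_t matrixSizeE = 28 * 28;                     // output plane * batch, illustrative
    size_t outOffset    = (size_t)pack * matrixSizeE * ocStart * bytes;     // offset into outputPtr
    size_t weightOffset = (size_t)hPack * icAlign * ocStartWeight * bytes;  // offset into mWeight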
unit.mTempWeight.reset(Tensor::create(std::vector{ocWeightSize, ic, hPack}, - mResource->mWeight->host() + hPack * ic * ocStartWeight)); - unit.mTempInputVector = std::vector{unit.mTempInput.get(), unit.mTempWeight.get(), unit.mTempBias.get()}; - unit.mTempOutputVector = std::vector{unit.mTempOutput.get()}; + AutoRelease mTempInput(Tensor::create(std::vector{icC4, matrixSizeE, core->pack}, inputPtr)); + AutoRelease mTempBias(Tensor::create({ocSize, 1, core->pack}, mResource->mBias->host() + core->pack * ocStart * bytes)); + AutoRelease mTempOutput( + Tensor::create(std::vector{ocSize, matrixSizeE, core->pack}, outputPtr + core->pack * matrixSizeE * ocStart * bytes)); + AutoRelease mTempWeight(Tensor::create(std::vector{ocWeightSize, ic, hPack}, + mResource->mWeight->host() + hPack * icAlign * ocStartWeight * bytes)); + unit.mTempInputVector = std::vector{mTempInput.get(), mTempWeight.get(), mTempBias.get()}; + unit.mTempOutputVector = std::vector{mTempOutput.get()}; memoryPool->beginGroup(); - std::shared_ptr __b(nullptr, [memoryPool](void *) { memoryPool->endGroup(); }); unit.mStracssenComputor->onReset(); auto code = unit.mStracssenComputor->onEncode(unit.mTempInputVector, unit.mTempOutputVector, postParameters); if (NO_ERROR != code) { + memoryPool->endGroup(); return code; } + memoryPool->endGroup(); } } return NO_ERROR; @@ -258,7 +278,8 @@ ErrorCode Convolution1x1Strassen::onExecute(const std::vector &inputs, auto size = mUnits.size(); auto input = inputs[0]; auto output = outputs[0]; - + auto core = static_cast(backend())->functions(); + if (!mNeedPretreat) { MNN_CONCURRENCY_BEGIN(tId, size) { auto &unit = mUnits[tId]; @@ -269,7 +290,8 @@ ErrorCode Convolution1x1Strassen::onExecute(const std::vector &inputs, MNN_CONCURRENCY_END(); return NO_ERROR; } - mPretreatFunction(input->host(), mTempInputBatch->host()); + int bytes = core->bytes; + mPretreatFunction(input->host(), mTempInputBatch->host()); MNN_CONCURRENCY_BEGIN(tId, size) { auto &unit = mUnits[tId]; if (unit.mValid) { @@ -280,14 +302,14 @@ ErrorCode Convolution1x1Strassen::onExecute(const std::vector &inputs, auto batch = input->batch(); auto outputPlane = output->height() * output->width(); - auto ocC4 = UP_DIV(output->channel(), 4); + auto ocC4 = UP_DIV(output->channel(), core->pack); MNN_CONCURRENCY_BEGIN(y, ocC4) { - auto srcY = mTempOutputBatch->host() + outputPlane * y * 4 * batch; - auto dstY = output->host() + y * outputPlane * 4; + auto srcY = mTempOutputBatch->host() + outputPlane * y * core->pack * batch * bytes; + auto dstY = output->host() + y * outputPlane * core->pack * bytes; for (int x = 0; x < batch; ++x) { - auto srcX = srcY + x * outputPlane * 4; - auto dstX = dstY + x * outputPlane * ocC4 * 4; - ::memcpy(dstX, srcX, outputPlane * 4 * sizeof(float)); + auto srcX = srcY + x * outputPlane * core->pack * bytes; + auto dstX = dstY + x * outputPlane * ocC4 * core->pack * bytes; + ::memcpy(dstX, srcX, outputPlane * core->pack * bytes); } } MNN_CONCURRENCY_END(); diff --git a/source/backend/cpu/compute/Convolution1x1Strassen.hpp b/source/backend/cpu/compute/Convolution1x1Strassen.hpp index cd8670a6..1cbec976 100644 --- a/source/backend/cpu/compute/Convolution1x1Strassen.hpp +++ b/source/backend/cpu/compute/Convolution1x1Strassen.hpp @@ -29,10 +29,6 @@ private: struct Unit { bool mValid = true; - std::shared_ptr mTempBias; - std::shared_ptr mTempInput; - std::shared_ptr mTempWeight; - std::shared_ptr mTempOutput; std::vector mTempInputVector; std::vector mTempOutputVector; std::shared_ptr mStracssenComputor; @@ 
-42,7 +38,7 @@ private: std::shared_ptr mTempInputBatch; std::shared_ptr mTempOutputBatch; bool mNeedPretreat = false; - std::function mPretreatFunction; + std::function mPretreatFunction; }; } // namespace MNN diff --git a/source/backend/cpu/compute/ConvolutionDepthwise3x3.cpp b/source/backend/cpu/compute/ConvolutionDepthwise3x3.cpp index 8eab0a13..91b3ad86 100644 --- a/source/backend/cpu/compute/ConvolutionDepthwise3x3.cpp +++ b/source/backend/cpu/compute/ConvolutionDepthwise3x3.cpp @@ -8,194 +8,15 @@ #include "backend/cpu/compute/ConvolutionDepthwise3x3.hpp" #include "backend/cpu/CPUBackend.hpp" +#include "CommonOptFunction.h" #include "core/Concurrency.h" #include "core/Macro.h" -#include "math/Vec.hpp" - -using Vec4 = MNN::Math::Vec; -extern "C" { -void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow); -void MNNConvDwF23SourceTransUnit(const float *source, float *dest, size_t unit); -} -static void _multiAndDestTransformCommon(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, - int ow) { - int unit = ow / 2; - MNN_ASSERT(cacheLineSize >= 1); - for (int x = 0; x < unit; ++x) { - auto offset = 4 * 4 * x; - int i = 0; - Vec4 m0 = Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0); - Vec4 m1 = Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1); - Vec4 m2 = Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2); - Vec4 m3 = Vec4::load(weigth + i * 16 + 4 * 3) * Vec4::load(cacheLine[i] + offset + 4 * 3); - - for (i = 1; i < cacheLineSize; ++i) { - m0 = m0 + Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0); - m1 = m1 + Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1); - m2 = m2 + Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2); - m3 = m3 + Vec4::load(weigth + i * 16 + 4 * 3) * Vec4::load(cacheLine[i] + offset + 4 * 3); - } - - auto o0 = m0 + m1 + m2; - auto o1 = m1 - m2 + m3; - Vec4::save(dest + 8 * x + 0 * 4, o0); - Vec4::save(dest + 8 * x + 1 * 4, o1); - } - if (unit * 2 < ow) { - auto offset = 4 * 4 * unit; - int i = 0; - Vec4 m0 = Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0); - Vec4 m1 = Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1); - Vec4 m2 = Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2); - - for (i = 1; i < cacheLineSize; ++i) { - m0 = m0 + Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0); - m1 = m1 + Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1); - m2 = m2 + Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2); - } - - auto o0 = m0 + m1 + m2; - Vec4::save(dest + 8 * unit + 0 * 4, o0); - } -} - -static void _sourceTransformCommon(const float *source, float *dest, int unit, int iw, int pad, int su, int eu) { - for (int x = 0; x < su; ++x) { - auto dstX = dest + 4 * 4 * x; - auto sx = x * 2 - (int)pad; - auto ex = sx + 4; - - auto clampSx = std::max(sx, 0); - auto clampEx = std::min(ex, (int)iw); - - Vec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f}; - for (int i = clampSx; i < clampEx; ++i) { - v[i - sx] = Vec4::load(source + 4 * i); - } - auto m0 = v[0] - v[2]; - auto m1 = v[1] + v[2]; - auto m2 = v[2] - v[1]; - auto m3 = v[3] - v[1]; - - Vec4::save(dstX + 4 * 0, m0); - Vec4::save(dstX + 4 * 1, m1); - Vec4::save(dstX + 4 * 2, m2); - Vec4::save(dstX + 4 * 3, m3); - } - 
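The m0..m3 built above (and in the MNNConvDwF23SourceTransUnit / MNNConvDwF23MulTransUnit bodies this patch moves into CommonOptFunction.cpp) are the 1D Winograd F(2,3) transforms applied to each 4-float channel lane: the input transform is B^T = [[1,0,-1,0],[0,1,1,0],[0,-1,1,0],[0,-1,0,1]], the output transform is A^T = [[1,1,1,0],[0,1,-1,1]], and the weights are pre-transformed with the standard G (the 0.5f * (k0 - k1 + k2) and k2 rows are visible in the ConvolutionDepthwise3x3 constructor below). A scalar single-lane sketch of the same arithmetic, for reference only (the function names are made up):

    // Input transform B^T * d: d0..d3 are four consecutive positions of one channel lane.
    static inline void winoF23Source(const float d[4], float m[4]) {
        m[0] = d[0] - d[2];
        m[1] = d[1] + d[2];
        m[2] = d[2] - d[1];
        m[3] = d[3] - d[1];
    }
    // Output transform A^T * m, applied after the element-wise multiply with the
    // transformed 3-tap weights; each tile of four inputs yields two outputs.
    static inline void winoF23Dest(const float m[4], float o[2]) {
        o[0] = m[0] + m[1] + m[2];
        o[1] = m[1] - m[2] + m[3];
    }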
MNNConvDwF23SourceTransUnit(source + 4 * (su * 2 - pad), dest + 4 * 4 * su, eu - su); - - for (int x = eu; x < unit; ++x) { - auto dstX = dest + 4 * 4 * x; - auto sx = x * 2 - (int)pad; - auto ex = sx + 4; - - auto clampSx = std::max(sx, 0); - auto clampEx = std::min(ex, (int)iw); - - Vec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f}; - for (int i = clampSx; i < clampEx; ++i) { - v[i - sx] = Vec4::load(source + 4 * i); - } - auto m0 = v[0] - v[2]; - auto m1 = v[1] + v[2]; - auto m2 = v[2] - v[1]; - auto m3 = v[3] - v[1]; - - Vec4::save(dstX + 4 * 0, m0); - Vec4::save(dstX + 4 * 1, m1); - Vec4::save(dstX + 4 * 2, m2); - Vec4::save(dstX + 4 * 3, m3); - } -} - -#ifndef MNN_USE_NEON -void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow) { - int unit = ow / 2; - auto w00 = Vec4::load(weigth + 0 * 16 + 4 * 0); - auto w01 = Vec4::load(weigth + 0 * 16 + 4 * 1); - auto w02 = Vec4::load(weigth + 0 * 16 + 4 * 2); - auto w03 = Vec4::load(weigth + 0 * 16 + 4 * 3); - auto w10 = Vec4::load(weigth + 1 * 16 + 4 * 0); - auto w11 = Vec4::load(weigth + 1 * 16 + 4 * 1); - auto w12 = Vec4::load(weigth + 1 * 16 + 4 * 2); - auto w13 = Vec4::load(weigth + 1 * 16 + 4 * 3); - auto w20 = Vec4::load(weigth + 2 * 16 + 4 * 0); - auto w21 = Vec4::load(weigth + 2 * 16 + 4 * 1); - auto w22 = Vec4::load(weigth + 2 * 16 + 4 * 2); - auto w23 = Vec4::load(weigth + 2 * 16 + 4 * 3); - for (int x = 0; x < unit; ++x) { - auto offset = 4 * 4 * x; - int i = 0; - Vec4 m0 = w00 * Vec4::load(cacheLine[0] + offset + 4 * 0); - Vec4 m1 = w01 * Vec4::load(cacheLine[0] + offset + 4 * 1); - Vec4 m2 = w02 * Vec4::load(cacheLine[0] + offset + 4 * 2); - Vec4 m3 = w03 * Vec4::load(cacheLine[0] + offset + 4 * 3); - - m0 = m0 + w10 * Vec4::load(cacheLine[1] + offset + 4 * 0); - m1 = m1 + w11 * Vec4::load(cacheLine[1] + offset + 4 * 1); - m2 = m2 + w12 * Vec4::load(cacheLine[1] + offset + 4 * 2); - m3 = m3 + w13 * Vec4::load(cacheLine[1] + offset + 4 * 3); - - m0 = m0 + w20 * Vec4::load(cacheLine[2] + offset + 4 * 0); - m1 = m1 + w21 * Vec4::load(cacheLine[2] + offset + 4 * 1); - m2 = m2 + w22 * Vec4::load(cacheLine[2] + offset + 4 * 2); - m3 = m3 + w23 * Vec4::load(cacheLine[2] + offset + 4 * 3); - - auto o0 = m0 + m1 + m2; - auto o1 = m1 - m2 + m3; - Vec4::save(dest + 8 * x + 0 * 4, o0); - Vec4::save(dest + 8 * x + 1 * 4, o1); - } - if (unit * 2 < ow) { - auto offset = 4 * 4 * unit; - Vec4 m0 = w00 * Vec4::load(cacheLine[0] + offset + 4 * 0); - Vec4 m1 = w01 * Vec4::load(cacheLine[0] + offset + 4 * 1); - Vec4 m2 = w02 * Vec4::load(cacheLine[0] + offset + 4 * 2); - - m0 = m0 + w10 * Vec4::load(cacheLine[1] + offset + 4 * 0); - m1 = m1 + w11 * Vec4::load(cacheLine[1] + offset + 4 * 1); - m2 = m2 + w12 * Vec4::load(cacheLine[1] + offset + 4 * 2); - - m0 = m0 + w20 * Vec4::load(cacheLine[2] + offset + 4 * 0); - m1 = m1 + w21 * Vec4::load(cacheLine[2] + offset + 4 * 1); - m2 = m2 + w22 * Vec4::load(cacheLine[2] + offset + 4 * 2); - auto o0 = m0 + m1 + m2; - Vec4::save(dest + 8 * unit + 0 * 4, o0); - } -} -void MNNConvDwF23SourceTransUnit(const float *source, float *dest, size_t unit) { - if (unit <= 0) { - return; - } - Vec4 v0 = Vec4::load(source + 4 * 0); - Vec4 v1 = Vec4::load(source + 4 * 1); - Vec4 v2; - Vec4 v3; - source += 8; - - for (int x = 0; x < unit; ++x) { - v2 = Vec4::load(source + 0 * 4); - v3 = Vec4::load(source + 1 * 4); - auto m0 = v0 - v2; - auto m1 = v1 + v2; - auto m2 = v2 - v1; - auto m3 = v3 - v1; - - Vec4::save(dest + 4 * 0, m0); - Vec4::save(dest + 4 * 1, m1); - Vec4::save(dest + 4 * 2, m2); - 
Vec4::save(dest + 4 * 3, m3); - - source += 8; - dest += 16; - - v0 = v2; - v1 = v3; - } -} -#endif namespace MNN { +ConvolutionDepthwise3x3::ConvolutionDepthwise3x3(std::shared_ptr resource, const Convolution2DCommon* common, Backend* b) : CPUConvolution(common, b) { + mResource = resource; +} + ConvolutionDepthwise3x3::ConvolutionDepthwise3x3(const Convolution2DCommon *common, Backend *b, const float *originWeight, size_t originWeightSize, const float *bias, size_t biasSize) @@ -203,30 +24,41 @@ ConvolutionDepthwise3x3::ConvolutionDepthwise3x3(const Convolution2DCommon *comm MNN_ASSERT(3 == common->kernelX() && 3 == common->kernelY()); MNN_ASSERT(1 == common->strideX() && 1 == common->strideY()); MNN_ASSERT(1 == common->dilateX() && 1 == common->dilateY()); - mBias.reset(Tensor::createDevice({(int)ALIGN_UP4(biasSize)})); - mValid = backend()->onAcquireBuffer(mBias.get(), Backend::STATIC); - if (!mValid) { - MNN_ERROR("Error for alloc memory in ConvolutionDepthwise3x3\n"); + mResource.reset(new Resource); + mResource->backend = b; + auto core = static_cast(b)->functions(); + auto pack = core->pack; + auto bytes = core->bytes; + auto success = mResource->copyBiasAlign(bias, biasSize); + if (!success) { + mValid = false; return; } - ::memset(mBias->host(), 0, mBias->size()); - ::memcpy(mBias->host(), bias, biasSize * sizeof(float)); auto channel = common->outputCount(); - auto channelC4 = UP_DIV(channel, 4); - mWeight.reset(Tensor::createDevice({channelC4, 3, 4, 4})); - mValid = backend()->onAcquireBuffer(mWeight.get(), Backend::STATIC); + auto channelC4 = UP_DIV(channel, pack); + auto unitSize = channelC4 * pack * 3 * 4; + mResource->mWeight.reset(Tensor::createDevice({unitSize * bytes})); + mValid = backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC); if (!mValid) { - MNN_ERROR("Error for alloc memory in ConvolutionDepthwise3x3\n"); return; } - auto weightHost = mWeight->host(); - ::memset(weightHost, 0, mWeight->size()); - + AutoStorage tempWeightStorge; + auto weightHost = mResource->mWeight->host(); + if (bytes < 4) { + // Lowp need extra float storage for transform + tempWeightStorge.reset(unitSize); + if (nullptr == tempWeightStorge.get()) { + mValid = false; + return; + } + weightHost = tempWeightStorge.get(); + } + ::memset(weightHost, 0, unitSize * sizeof(float)); /* 1D-Winograd F(2,3) and tiling */ for (int c = 0; c < channel; ++c) { - auto cIndex = c / 4; - auto cRemain = c % 4; - auto weightDstZ = weightHost + cIndex * 4 * 4 * 3 + cRemain; + auto cIndex = c / pack; + auto cRemain = c % pack; + auto weightDstZ = weightHost + cIndex * pack * 4 * 3 + cRemain; auto weightSrcZ = originWeight + c * 9; for (int y = 0; y < 3; ++y) { auto k0 = weightSrcZ[3 * y + 0]; @@ -238,21 +70,28 @@ ConvolutionDepthwise3x3::ConvolutionDepthwise3x3(const Convolution2DCommon *comm auto m2 = 0.5f * (k0 - k1 + k2); auto m3 = k2; - weightDstZ[y * 16 + 4 * 0] = m0; - weightDstZ[y * 16 + 4 * 1] = m1; - weightDstZ[y * 16 + 4 * 2] = m2; - weightDstZ[y * 16 + 4 * 3] = m3; + weightDstZ[(y * 4 + 0) * pack] = m0; + weightDstZ[(y * 4 + 1) * pack] = m1; + weightDstZ[(y * 4 + 2) * pack] = m2; + weightDstZ[(y * 4 + 3) * pack] = m3; } } + if (bytes < 4) { + core->MNNFp32ToLowp(weightHost, mResource->mWeight->host(), unitSize); + } } ConvolutionDepthwise3x3::~ConvolutionDepthwise3x3() { - if (nullptr != mBias) { - backend()->onReleaseBuffer(mBias.get(), Backend::STATIC); - } - if (nullptr != mWeight) { - backend()->onReleaseBuffer(mWeight.get(), Backend::STATIC); + // Do nothing +} + +bool 
ConvolutionDepthwise3x3::onClone(Backend* bn, const Op* op, Execution** dst) { + if (nullptr == dst) { + return true; } + auto dstExe = new ConvolutionDepthwise3x3(mResource, op->main_as_Convolution2D()->common(), bn); + *dst = dstExe; + return true; } ErrorCode ConvolutionDepthwise3x3::onResize(const std::vector &inputs, const std::vector &outputs) { @@ -260,8 +99,9 @@ ErrorCode ConvolutionDepthwise3x3::onResize(const std::vector &inputs, int numberThread = ((CPUBackend *)backend())->threadNumber(); auto output = outputs[0]; auto owUnit = UP_DIV(output->width(), 2); - // 3 cacheline, 4 is the unit of transform - mCacheLine.reset(Tensor::createDevice({numberThread, 3, owUnit * 4, 4})); + auto core = static_cast(backend())->functions(); + // 3 cacheline + mCacheLine.reset(Tensor::createDevice({numberThread, 3 * 4 * owUnit * core->pack * core->bytes})); auto valid = backend()->onAcquireBuffer(mCacheLine.get(), Backend::DYNAMIC); if (!valid) { return OUT_OF_MEMORY; @@ -270,7 +110,7 @@ ErrorCode ConvolutionDepthwise3x3::onResize(const std::vector &inputs, auto iw = inputs[0]->width(); mSourceStartX = UP_DIV(mPadX, 2); mSourceEndX = std::max((iw + mPadX - 4) / 2, mSourceStartX); - + mPostParameters = getPostParameters(); // auto rate = (float)(mSourceEndX-mSourceStartX) / (float)owUnit; // FUNC_PRINT_ALL(rate, f); return NO_ERROR; @@ -280,7 +120,9 @@ ErrorCode ConvolutionDepthwise3x3::onExecute(const std::vector &inputs const std::vector &outputs) { auto input = inputs[0]; auto output = outputs[0]; - int channelC4 = UP_DIV(input->channel(), 4); + auto core = static_cast(backend())->functions(); + + int channelC4 = UP_DIV(input->channel(), core->pack); int initSize = std::min(input->height(), 2); int batch = input->batch(); int ow = output->width(); @@ -289,7 +131,7 @@ ErrorCode ConvolutionDepthwise3x3::onExecute(const std::vector &inputs auto iw = input->width(); auto ih = input->height(); - auto kernelOrigin = mWeight->host(); + auto kernelOrigin = mResource->mWeight->host(); /*oy-mPadY>=0*/ int middelYStart = mPadY; @@ -299,72 +141,70 @@ ErrorCode ConvolutionDepthwise3x3::onExecute(const std::vector &inputs int threadNumber = ((CPUBackend *)backend())->threadNumber(); auto maxKernelH = std::min(mPadY + ih, 3); + auto total = channelC4 * batch; + auto inputOrigin = input->host(); + auto outputOrigin = output->host(); + MNN_CONCURRENCY_BEGIN(tId, threadNumber) { + auto cacheLineStart = mCacheLine->host() + tId * mCacheLine->stride(0); + for (int index = (int)tId; index < total; index += threadNumber) { + int z = index % channelC4; + auto inputZ = inputOrigin + core->pack * index * iw * ih * core->bytes; + auto outputZ = outputOrigin + core->pack * index * ow * oh * core->bytes; + auto kernelZ = kernelOrigin + z * core->pack * core->bytes * 4 * 3; + auto cacheLine0 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 0; + auto cacheLine1 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 1; + auto cacheLine2 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 2; - for (int batchIndex = 0; batchIndex < batch; ++batchIndex) { - auto inputOrigin = input->host() + batchIndex * input->stride(0); - auto outputOrigin = output->host() + batchIndex * output->stride(0); - MNN_CONCURRENCY_BEGIN(tId, threadNumber) { - auto cacheLineStart = mCacheLine->host() + tId * mCacheLine->stride(0); - for (int z = (int)tId; z < channelC4; z += threadNumber) { - auto inputZ = inputOrigin + 4 * z * iw * ih; - auto outputZ = outputOrigin + 4 * z * ow * oh; - auto kernelZ = kernelOrigin + z * 
mWeight->stride(0); - auto cacheLine0 = cacheLineStart + 16 * owUnit * 0; - auto cacheLine1 = cacheLineStart + 16 * owUnit * 1; - auto cacheLine2 = cacheLineStart + 16 * owUnit * 2; + float *cacheLine[3] = {(float*)cacheLine0, (float*)cacheLine1, (float*)cacheLine2}; - float *cacheLine[3] = {cacheLine0, cacheLine1, cacheLine2}; - - // Init - for (int i = 0; i < initSize; ++i) { - _sourceTransformCommon(inputZ + i * iw * 4, cacheLine[i], owUnit, iw, mPadX, mSourceStartX, - mSourceEndX); - } - - // Compute Top - for (int y = 0; y < middelYStart; ++y) { - auto outputY = outputZ + y * 4 * ow; - int cacheLineSize = y - mPadY + maxKernelH; - if (cacheLineSize <= 0) { - ::memset(outputY, 0, 4 * ow * sizeof(float)); - continue; - } - auto kernelPtr = kernelZ + (maxKernelH - cacheLineSize) * 16; - _multiAndDestTransformCommon(cacheLine, kernelPtr, outputY, cacheLineSize, ow); - } - - // Compute Mid - for (int y = middelYStart; y < middelYEnd; ++y) { - auto outputY = outputZ + y * 4 * ow; - auto iy = y - mPadY + 2; - _sourceTransformCommon(inputZ + 4 * iy * iw, cacheLine[2], owUnit, iw, mPadX, mSourceStartX, - mSourceEndX); - // FUNC_PRINT(ow); - MNNConvDwF23MulTransUnit(cacheLine, kernelZ, outputY, ow); - - auto temp = cacheLine[0]; - cacheLine[0] = cacheLine[1]; - cacheLine[1] = cacheLine[2]; - cacheLine[2] = temp; - } - - // Compute Bottom - for (int y = middelYEnd; y < oh; ++y) { - auto outputY = outputZ + y * 4 * ow; - int cacheLineSize = (ih - y + mPadY); - if (cacheLineSize <= 0) { - ::memset(outputY, 0, 4 * ow * sizeof(float)); - continue; - } - _multiAndDestTransformCommon(cacheLine, kernelZ, outputY, cacheLineSize, ow); - cacheLine[0] = cacheLine[1]; - cacheLine[1] = cacheLine[2]; - } - mPostFunction(outputZ, mBias->host() + 4 * z, ow * oh, 1); + // Init + for (int i = 0; i < initSize; ++i) { + core->MNNSourceTransformCommonF23((const float*)(inputZ + i * iw * core->bytes * core->pack), cacheLine[i], owUnit, iw, mPadX, mSourceStartX, + mSourceEndX); } + + // Compute Top + for (int y = 0; y < middelYStart; ++y) { + auto outputY = outputZ + y * core->bytes * core->pack * ow; + int cacheLineSize = y - mPadY + maxKernelH; + if (cacheLineSize <= 0) { + ::memset(outputY, 0, core->bytes * ow * core->pack); + continue; + } + auto kernelPtr = kernelZ + (maxKernelH - cacheLineSize) * 4 * core->pack * core->bytes; + core->MNNMultiAndDestTransformCommon23(cacheLine, (float*)kernelPtr, (float*)outputY, cacheLineSize, ow); + } + + // Compute Mid + for (int y = middelYStart; y < middelYEnd; ++y) { + auto outputY = outputZ + y * core->bytes * core->pack * ow; + auto iy = y - mPadY + 2; + core->MNNSourceTransformCommonF23((float*)(inputZ + core->bytes * core->pack * iy * iw), cacheLine[2], owUnit, iw, mPadX, mSourceStartX, + mSourceEndX); + // FUNC_PRINT(ow); + core->MNNConvDwF23MulTransUnit(cacheLine, (float*)kernelZ, (float*)outputY, ow); + + auto temp = cacheLine[0]; + cacheLine[0] = cacheLine[1]; + cacheLine[1] = cacheLine[2]; + cacheLine[2] = temp; + } + + // Compute Bottom + for (int y = middelYEnd; y < oh; ++y) { + auto outputY = outputZ + y * core->bytes * core->pack * ow; + int cacheLineSize = (ih - y + mPadY); + if (cacheLineSize <= 0) { + ::memset(outputY, 0, ow * core->bytes * core->pack); + continue; + } + core->MNNMultiAndDestTransformCommon23(cacheLine, (float*)kernelZ, (float*)outputY, cacheLineSize, ow); + cacheLine[0] = cacheLine[1]; + cacheLine[1] = cacheLine[2]; + } + core->MNNAxByClampBroadcastUnit((float*)outputZ, (float*)outputZ, (float*)(mResource->mBias->host() + core->bytes * 
core->pack * z), ow * oh, 0, 0, 1, mPostParameters.data()); } - MNN_CONCURRENCY_END(); - } + } MNN_CONCURRENCY_END(); return NO_ERROR; } } // namespace MNN diff --git a/source/backend/cpu/compute/ConvolutionDepthwise3x3.hpp b/source/backend/cpu/compute/ConvolutionDepthwise3x3.hpp index e6630e8e..319021bb 100644 --- a/source/backend/cpu/compute/ConvolutionDepthwise3x3.hpp +++ b/source/backend/cpu/compute/ConvolutionDepthwise3x3.hpp @@ -20,14 +20,16 @@ public: virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; - + virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override; private: - std::unique_ptr mWeight; - std::unique_ptr mBias; + ConvolutionDepthwise3x3(std::shared_ptr resource, const Convolution2DCommon* common, Backend* b); + + std::shared_ptr mResource; std::unique_ptr mCacheLine; int mSourceStartX = 0; int mSourceEndX = 0; + std::vector mPostParameters; }; } // namespace MNN diff --git a/source/backend/cpu/compute/ConvolutionFloatFactory.cpp b/source/backend/cpu/compute/ConvolutionFloatFactory.cpp index 4797bd0d..c48db085 100644 --- a/source/backend/cpu/compute/ConvolutionFloatFactory.cpp +++ b/source/backend/cpu/compute/ConvolutionFloatFactory.cpp @@ -22,10 +22,10 @@ namespace MNN { static Execution* _createUnit(const Tensor* input, const Tensor* output, Backend* backend, const Convolution2DCommon* common, const float* originWeight, size_t originWeightSize, const float* bias, size_t biasSize) { + auto layer = common; #ifdef MNN_USE_ONEDNN return OneDNN::createConvolution(common, backend, originWeight, originWeightSize, bias, biasSize); #endif - auto layer = common; bool fastWay = layer->kernelY() == 1 && layer->kernelX() == 1; if (fastWay) { return new Convolution1x1Strassen(common, backend, originWeight, originWeightSize, bias, biasSize); @@ -37,7 +37,7 @@ static Execution* _createUnit(const Tensor* input, const Tensor* output, Backend if (cpuBackend->memoryMode() == BackendConfig::Memory_Low) { return new ConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize); } - auto unit = ConvolutionWinograd::bestWinogradUnit(common, input, output, cpuBackend->threadNumber()); + auto unit = ConvolutionWinograd::bestWinogradUnit(common, input, output, cpuBackend->threadNumber(), backend); if (unit <= 1) { return new ConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize); } @@ -69,7 +69,12 @@ Execution* ConvolutionFloatFactory::create(const std::vector& inputs, c MNN_ERROR("Memory not Enough, can't extract IDST Convolution: %s \n", op->name()->c_str()); return nullptr; } + if (quanCommon->weightFloat.get() == nullptr) { + if (backend->type() != MNN_FORWARD_CPU) { + // From BF16 + return nullptr; + } return ConvolutionIntFactory::create(inputs[0], outputs[0], op, backend, quanCommon.get()); } // Back to float diff --git a/source/backend/cpu/compute/ConvolutionGroup.cpp b/source/backend/cpu/compute/ConvolutionGroup.cpp index d36a900e..f21c7b86 100644 --- a/source/backend/cpu/compute/ConvolutionGroup.cpp +++ b/source/backend/cpu/compute/ConvolutionGroup.cpp @@ -72,28 +72,28 @@ ErrorCode ConvolutionGroup::onExecute(const std::vector &inputs, const auto input = inputs[0]; auto output = outputs[0]; int batch = input->buffer().dim[0].extent; - auto inputBatchSize = input->width() * input->height() * ALIGN_UP4(input->channel()); - auto outputBatchSize = output->width() * output->height() 
* ALIGN_UP4(output->channel()); + auto core = static_cast(backend())->functions(); + auto inputBatchSize = input->width() * input->height() * UP_DIV(input->channel(), core->pack) * core->pack; + auto outputBatchSize = output->width() * output->height() * UP_DIV(output->channel(), core->pack) * core->pack; for (int b = 0; b < batch; ++b) { - auto srcOrigin = input->host() + b * inputBatchSize; - auto dstOrigin = output->host() + b * outputBatchSize; + auto srcOrigin = input->host() + b * inputBatchSize * core->bytes; + auto dstOrigin = output->host() + b * outputBatchSize * core->bytes; - MNNUnpackC4(mInputRaw->host(), srcOrigin, input->width() * input->height(), input->channel()); + core->MNNUnpackCUnit(mInputRaw->host(), (float*)srcOrigin, input->width() * input->height(), input->channel()); int inputGroupSize = input->width() * input->height() * input->channel() / mSubConvolution.size(); int outputGroupSize = output->width() * output->height() * output->channel() / mSubConvolution.size(); int subInputChannel = input->channel() / mSubConvolution.size(); int subOutputChannel = output->channel() / mSubConvolution.size(); for (int group = 0; group < mSubConvolution.size(); ++group) { - MNNPackC4(mInputUnit->host(), mInputRaw->host() + group * inputGroupSize, + core->MNNPackCUnit(mInputUnit->host(), (const float*)(mInputRaw->host() + group * inputGroupSize * core->bytes), input->width() * input->height(), subInputChannel); mSubConvolution[group]->onExecute(mInputUnitWrap, mOutputUnitWrap); - MNNUnpackC4(mOutputRaw->host() + group * outputGroupSize, mOutputUnit->host(), + core->MNNUnpackCUnit((float*)(mOutputRaw->host() + group * outputGroupSize * core->bytes), mOutputUnit->host(), output->width() * output->height(), subOutputChannel); } - MNNPackC4(dstOrigin, mOutputRaw->host(), output->width() * output->height(), output->channel()); + core->MNNPackCUnit((float*)dstOrigin, mOutputRaw->host(), output->width() * output->height(), output->channel()); } - return NO_ERROR; } } // namespace MNN diff --git a/source/backend/cpu/compute/ConvolutionInt8Executor.cpp b/source/backend/cpu/compute/ConvolutionInt8Executor.cpp index edbfee98..6e453b68 100644 --- a/source/backend/cpu/compute/ConvolutionInt8Executor.cpp +++ b/source/backend/cpu/compute/ConvolutionInt8Executor.cpp @@ -132,6 +132,7 @@ ErrorCode ConvolutionInt8Executor::onResize(const std::vector& inputs, backend()->onReleaseBuffer(&mTempDstBuffer, Backend::DYNAMIC); backend()->onReleaseBuffer(&mTempBuffer, Backend::DYNAMIC); + mPostParameters = getPostParameters(); return NO_ERROR; } @@ -352,7 +353,7 @@ ErrorCode ConvolutionInt8Executor::onExecute(const std::vector& inputs, for (int z = (int)tId; z < ocC4; z += threadNumber) { MNNScaleAndAddBias(dstOrigin + z * dstZStep, dstOrigin + z * dstZStep, mBias.get() + 4 * z, mAlpha.get() + 4 * z, width * height, 1); - mPostFunction(dstOrigin + z * dstZStep, mBias.get() + 4 * z, width * height, 1); + MNNAxByClampBroadcastUnit(dstOrigin + z * dstZStep, dstOrigin + z * dstZStep, mBias.get() + 4 * z, width * height, 0, 0, 1, mPostParameters.data()); } } MNN_CONCURRENCY_END(); diff --git a/source/backend/cpu/compute/ConvolutionInt8Executor.hpp b/source/backend/cpu/compute/ConvolutionInt8Executor.hpp index 697b6598..c9b465ee 100644 --- a/source/backend/cpu/compute/ConvolutionInt8Executor.hpp +++ b/source/backend/cpu/compute/ConvolutionInt8Executor.hpp @@ -38,6 +38,7 @@ private: float mAMin; float mAMax; float mQuanScale; + std::vector mPostParameters; }; } // namespace MNN diff --git 
a/source/backend/cpu/compute/ConvolutionTiledExecutor.cpp b/source/backend/cpu/compute/ConvolutionTiledExecutor.cpp index 3b426a70..83ed4517 100644 --- a/source/backend/cpu/compute/ConvolutionTiledExecutor.cpp +++ b/source/backend/cpu/compute/ConvolutionTiledExecutor.cpp @@ -15,10 +15,11 @@ #include "core/Macro.h" #include "core/TensorUtils.hpp" #include "math/Vec.hpp" +#include "core/BufferAllocator.hpp" using Vec4 = MNN::Math::Vec; namespace MNN { -static void _initWeight(float *dest, const float *source, float* cache, int depth, int outputCount, int kernelSize) { +static void _initWeight(float *dest, const float *source, float* cache, int depth, int outputCount, int kernelSize, const CoreFunctions* function) { // Swap k, ic int dims[4] = { depth, @@ -31,36 +32,39 @@ static void _initWeight(float *dest, const float *source, float* cache, int dept auto sO = source + o * depth * kernelSize; MNNTranspose32Bit((int32_t*)dO, (const int32_t*)sO, &dims[0]); } - MNNPackForMatMul_B(dest, cache, outputCount, kernelSize * depth, true); + if (function->bytes < 4) { + // Lowp + function->MNNFp32ToLowp((float*)cache, (int16_t*)cache, outputCount * kernelSize * depth); + } + function->MNNPackForMatMul_B(dest, cache, outputCount, kernelSize * depth, true); } ConvolutionTiledExecutor::ConvolutionTiledExecutor(const Convolution2DCommon* common, Backend* b, const float* originWeight, size_t originWeightSize, const float* bias, size_t biasSize) : MNN::Execution(b) { auto outputCount = (int)biasSize; - int eP, lP, hP; - MNNGetMatMulPackMode(&eP, &lP, &hP); mResource.reset(new CPUConvolution::Resource); mResource->backend = b; - + int eP, lP, hP; + auto core = static_cast(b)->functions(); + int bytes = core->bytes; + core->MNNGetMatMulPackMode(&eP, &lP, &hP); // Don't use common->inputCount for old model common->inputCount is zero auto srcCount = (int)originWeightSize / outputCount / common->kernelX() / common->kernelY(); - mResource->mWeight.reset(Tensor::createDevice( - {UP_DIV(outputCount, hP), UP_DIV(srcCount, 4), (int)common->kernelX(), common->kernelY(), 4 * hP})); - std::shared_ptr cache(Tensor::createDevice({outputCount, srcCount * common->kernelX() * common->kernelY()})); + auto lSize = srcCount * common->kernelX() * common->kernelY(); + mResource->mWeight.reset(Tensor::createDevice( + {UP_DIV(outputCount, hP) * UP_DIV(lSize, lP) * hP * lP * bytes})); + std::shared_ptr cache(Tensor::createDevice({outputCount * srcCount * common->kernelX() * common->kernelY() * (int)sizeof(float)})); // cache must be float mValid = backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC) && backend()->onAcquireBuffer(cache.get(), Backend::STATIC); if (!mValid) { return; } - _initWeight(mResource->mWeight->host(), originWeight, cache->host(), srcCount, outputCount, common->kernelX() * common->kernelY()); + _initWeight(mResource->mWeight->host(), originWeight, cache->host(), srcCount, outputCount, common->kernelX() * common->kernelY(), core); backend()->onReleaseBuffer(cache.get(), Backend::STATIC); - mResource->mBias.reset(Tensor::createDevice({ALIGN_UP4((int)biasSize)})); - mValid = backend()->onAcquireBuffer(mResource->mBias.get(), Backend::STATIC); + mValid = mResource->copyBiasAlign(bias, biasSize); if (!mValid) { return; } - ::memset(mResource->mBias->host(), 0, mResource->mBias->size()); - ::memcpy(mResource->mBias->host(), bias, biasSize * sizeof(float)); mProxy.reset(new ConvolutionTiledExecutorBasic(common, b)); } @@ -89,6 +93,14 @@ ErrorCode ConvolutionTiledExecutorBasic::onResize(const 
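In the ConvolutionTiledExecutor constructor above, the packed weight tensor is now sized directly in bytes from the backend's (eP, lP, hP) pack mode instead of a fixed C4 shape. A worked example with made-up sizes; the pack-mode values are hypothetical, not taken from any particular backend:

```cpp
#include <cstdio>

static inline int upDiv(int x, int y) { return (x + y - 1) / y; }

// Illustration of UP_DIV(outputCount, hP) * UP_DIV(lSize, lP) * hP * lP * bytes
// with lSize = srcCount * kernelX * kernelY.
int main() {
    const int outputCount = 30, srcCount = 17, kx = 3, ky = 3;
    const int lP = 1, hP = 4;              // hypothetical MNNGetMatMulPackMode result
    const int bytes = 4;                   // core->bytes for FP32
    const int lSize = srcCount * kx * ky;  // 153
    const int weightBytes =
        upDiv(outputCount, hP) * upDiv(lSize, lP) * hP * lP * bytes;
    // 8 * 153 * 4 * 1 * 4 = 19584 bytes: both dimensions are rounded up so the
    // packed GEMM kernels can touch whole hP x lP blocks without bounds checks.
    std::printf("packed weight buffer: %d bytes\n", weightBytes);
    return 0;
}
```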
std::vector& in auto input = inputs[0]; auto weight = inputs[1]; Tensor* bias = nullptr; + auto core = static_cast(backend())->functions(); + int bytes = core->bytes; + int unit = core->pack; + auto packA = core->MNNPackC4ForMatMul_A; + auto matmulUnit = core->MNNPackedMatMul; + auto matmulRemain = core->MNNPackedMatMulRemain; + int eP, lP, hP; + core->MNNGetMatMulPackMode(&eP, &lP, &hP); const float* biasPtr = nullptr; if (inputs.size() > 2) { bias = inputs[2]; @@ -101,57 +113,44 @@ ErrorCode ConvolutionTiledExecutorBasic::onResize(const std::vector& in auto weightPtr = weight->host(); auto src_width = input->width(); auto src_height = input->height(); - int src_z_step = input->width() * input->height() * 4; - int eP, lP, hP; - MNNGetMatMulPackMode(&eP, &lP, &hP); + int src_z_step = input->width() * input->height() * unit; auto CONVOLUTION_TILED_NUMBER = eP; - auto& tempBuffer = mTempBuffer.buffer(); - auto icC4 = UP_DIV(input->channel(), 4); + auto icC4 = UP_DIV(input->channel(), unit); auto ic = input->channel(); - auto L = input->channel() * mCommon->kernelY() * mCommon->kernelX(); + auto L = ic * mCommon->kernelY() * mCommon->kernelX(); auto kernelSize = mCommon->kernelX() * mCommon->kernelY(); - tempBuffer.dim[0].extent = threadNumber; - tempBuffer.dim[1].extent = CONVOLUTION_TILED_NUMBER; - tempBuffer.dim[2].extent = icC4 * mCommon->kernelY() * mCommon->kernelX(); // srcCount * kx*ky - tempBuffer.dim[3].extent = 4; - TensorUtils::setLinearLayout(&mTempBuffer); - + mTempBufferTranspose.buffer().type = halide_type_of(); mTempBufferTranspose.buffer().dimensions = 2; mTempBufferTranspose.buffer().dim[0].extent = threadNumber; - mTempBufferTranspose.buffer().dim[1].extent = L * CONVOLUTION_TILED_NUMBER; + mTempBufferTranspose.buffer().dim[1].extent = UP_DIV(L, lP) * lP * CONVOLUTION_TILED_NUMBER * bytes; TensorUtils::setLinearLayout(&mTempBufferTranspose); - int count = UP_DIV(width*height, CONVOLUTION_TILED_NUMBER); + int tileCount = UP_DIV(width*height, CONVOLUTION_TILED_NUMBER); int plane = width * height; - bool success = backend()->onAcquireBuffer(&mTempBuffer, Backend::DYNAMIC) && backend()->onAcquireBuffer(&mTempBufferTranspose, Backend::DYNAMIC); + bool success = backend()->onAcquireBuffer(&mTempBufferTranspose, Backend::DYNAMIC); if (!success) { return OUT_OF_MEMORY; } - auto hDiv = MNNGetC4DivNumber(hP); auto outputChannel = output->channel(); - auto oC4 = UP_DIV(outputChannel, 4); - std::shared_ptr cache; - if (hP % 4 != 0) { - cache.reset(Tensor::createDevice({threadNumber, 4 * hDiv * eP + oC4 * 4 * eP})); - success = backend()->onAcquireBuffer(cache.get(), Backend::DYNAMIC); - if (!success) { - return OUT_OF_MEMORY; - } - backend()->onReleaseBuffer(cache.get(), Backend::DYNAMIC); + auto oC4 = UP_DIV(outputChannel, unit); + auto bufferAlloc = static_cast(backend())->getBufferAllocator(); + auto maxLine = UP_DIV(CONVOLUTION_TILED_NUMBER, width) + 1; + auto tempPtr = bufferAlloc->alloc(kernelSize * maxLine * threadNumber * (4 * sizeof(int32_t) + sizeof(float*))); + if (nullptr == tempPtr.first) { + return OUT_OF_MEMORY; } - - backend()->onReleaseBuffer(&mTempBuffer, Backend::DYNAMIC); backend()->onReleaseBuffer(&mTempBufferTranspose, Backend::DYNAMIC); + bufferAlloc->free(tempPtr); std::vector parameters(6); - parameters[0] = eP * sizeof(float); + parameters[0] = eP * bytes; parameters[1] = L; parameters[2] = outputChannel; - parameters[3] = plane * 4 * sizeof(float); + parameters[3] = plane * unit * bytes; parameters[4] = 0; parameters[5] = 0; - auto threadNumberFirst = 
std::min(threadNumber, count); + auto threadNumberFirst = std::min(threadNumber, tileCount); auto postParameters = getPostParameters(); mFunction.first = threadNumberFirst; auto strideX = mCommon->strideX(); @@ -177,69 +176,83 @@ ErrorCode ConvolutionTiledExecutorBasic::onResize(const std::vector& in kernel_width = kernel_height; kernel_height = 1; } - mFunction.second = [=](int tId) { - auto colBuffer = mTempBuffer.host() + mTempBuffer.stride(0) * tId; - auto gemmBuffer = mTempBufferTranspose.host() + mTempBufferTranspose.stride(0) * tId; - float* cachePtr = nullptr; - if (nullptr != cache) { - cachePtr = cache->host() + tId * cache->stride(0); - } - for (int batchIndex = 0; batchIndex < input->batch(); ++batchIndex) { - auto dstOrigin = output->host() + batchIndex * output->stride(0); - auto srcOrigin = input->host() + batchIndex * input->stride(0); - for (int x = (int)tId; x < count; x += threadNumberFirst) { + auto outputBatchStride = width * height * oC4 * unit; + auto inputBatchStride = src_width * src_height * icC4 * unit; + mFunction.second = [=](int tId) { + auto gemmBuffer = mTempBufferTranspose.host() + mTempBufferTranspose.stride(0) * tId; + auto srcPtr = (float const**)((uint8_t*)tempPtr.first + tempPtr.second + tId * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float*))); + auto el = (int32_t*)(srcPtr + kernelSize * maxLine); + + int32_t info[4]; + info[1] = src_width * src_height; + info[2] = eP; + info[3] = strideX; + for (int batchIndex = 0; batchIndex < input->batch(); ++batchIndex) { + auto dstOrigin = output->host() + batchIndex * outputBatchStride * bytes; + auto srcOrigin = input->host() + batchIndex * inputBatchStride * bytes; + + for (int x = (int)tId; x < tileCount; x += threadNumberFirst) { int start = (int)x * CONVOLUTION_TILED_NUMBER; int remain = plane - start; int xC = remain > CONVOLUTION_TILED_NUMBER ? 
CONVOLUTION_TILED_NUMBER : remain; - // Im2Col - ::memset(colBuffer, 0, mTempBuffer.stride(0) * sizeof(float)); + // Compute Pack position int oyBegin = start / width; int oxBegin = start % width; int oyEnd = (start + xC-1) / width; remain = xC; - auto colIndex = colBuffer; + int number = 0; + bool needZero = false; + int eStart = 0; for (int oy=oyBegin; oy <= oyEnd; ++oy) { int step = std::min(width - oxBegin, remain); int sySta = oy * strideY - padY; int kyStart = std::max(0, UP_DIV(-sySta, dilateY)); int kyEnd = std::min(kernel_height, UP_DIV(src_height - sySta, dilateY)); - for (int i=0; i sta) { + auto lOffset = lKYOffset + (kx * ic); + auto srcKx = srcKy + ((oxBegin + sta) * strideX + dilateX * kx - padX) * bytes * unit; + srcPtr[number] = (const float*)srcKx; + el[4 * number + 0] = end - sta; + el[4 * number + 1] = ic; + el[4 * number + 2] = eStart + sta; + el[4 * number + 3] = lOffset; + number++; } } } oxBegin = 0; remain -= step; - colIndex += 4 * step; + eStart += step; + } + info[0] = number; + if (needZero || lP != 1) { + ::memset(gemmBuffer, 0, mTempBufferTranspose.stride(0)); + } + if (number > 0) { + packA((float*)gemmBuffer, srcPtr, info, el); } // GEMM - MNNPackC4ForMatMul_A(gemmBuffer, colBuffer, CONVOLUTION_TILED_NUMBER * kernelSize, ic, CONVOLUTION_TILED_NUMBER * kernelSize); if (xC == CONVOLUTION_TILED_NUMBER) { - MNNPackedMatMul(dstOrigin + start * 4, gemmBuffer, weightPtr, parameters.data(), cachePtr, postParameters.data(), biasPtr); + matmulUnit((float*)(dstOrigin + start * unit * bytes), (float*)gemmBuffer, weightPtr, parameters.data(), postParameters.data(), biasPtr); } else { - MNNPackedMatMulRemain(dstOrigin + start * 4, gemmBuffer, weightPtr, xC, parameters.data(), cachePtr, postParameters.data(), biasPtr); + matmulRemain((float*)(dstOrigin + start * unit * bytes), (float*)gemmBuffer, weightPtr, xC, parameters.data(), postParameters.data(), biasPtr); } } } diff --git a/source/backend/cpu/compute/ConvolutionTiledExecutor.hpp b/source/backend/cpu/compute/ConvolutionTiledExecutor.hpp index a16bc9d6..8526c3d6 100644 --- a/source/backend/cpu/compute/ConvolutionTiledExecutor.hpp +++ b/source/backend/cpu/compute/ConvolutionTiledExecutor.hpp @@ -23,7 +23,6 @@ public: virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; protected: - Tensor mTempBuffer; Tensor mTempBufferTranspose; std::pair> mFunction; }; diff --git a/source/backend/cpu/compute/ConvolutionWinograd.cpp b/source/backend/cpu/compute/ConvolutionWinograd.cpp index 5a9b7fa6..48abe3c0 100644 --- a/source/backend/cpu/compute/ConvolutionWinograd.cpp +++ b/source/backend/cpu/compute/ConvolutionWinograd.cpp @@ -28,18 +28,15 @@ ConvolutionWinograd::ConvolutionWinograd(const Convolution2DCommon *convOp, cons Backend *b, const float *originWeight, size_t originWeightSize, const float *bias, size_t biasSize, int unit) : MNN::CPUConvolution(convOp, b) { + auto core = static_cast(backend())->functions(); + int pack = core->pack, bytes = core->bytes; mResource.reset(new Resource); mResource->backend = b; - mResource->mBias.reset(Tensor::createDevice({ALIGN_UP4((int)biasSize)})); - mValid = backend()->onAcquireBuffer(mResource->mBias.get(), Backend::STATIC); - if (!mValid) { + if (!mResource->copyBiasAlign(bias, biasSize)) { + MNN_ERROR("Not Enough Memory\n"); + mValid = false; return; } - - ::memset(mResource->mBias->host(), 0, mResource->mBias->size()); - ::memcpy(mResource->mBias->host(), bias, biasSize * sizeof(float)); - mTempBuffer.buffer().type = halide_type_of(); - 
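The onResize hunk above removes the materialised im2col buffer (mTempBuffer / colBuffer) and instead hands MNNPackC4ForMatMul_A a list of source segments through the info/el arrays. The field meanings below are inferred from the call sites in this patch (TiledExecutor, Winograd, Strassen), not from a header, so treat them as an annotated guess:

```cpp
#include <cstdint>

// Descriptor for one packing call, as this patch appears to use it.
struct PackDescriptor {
    int32_t info[4];      // info[0]: number of source segments
                          // info[1]: source e-stride (input plane size, in packed units)
                          // info[2]: eP of the destination tile
                          // info[3]: x stride between sampled positions
    int32_t el[4];        // per segment: {e count, l count (channels), e offset, l offset}
    const float* src;     // start address of this segment
};

// The simple case used by the Winograd path: one segment that is already
// contiguous, i.e. info = {1, xC, xC, 1} and el = {xC, ic, 0, 0}.
static PackDescriptor describeContiguousTile(const float* base, int xC, int ic) {
    PackDescriptor d;
    d.info[0] = 1;  d.info[1] = xC;  d.info[2] = xC;  d.info[3] = 1;
    d.el[0]   = xC; d.el[1]   = ic;  d.el[2]   = 0;   d.el[3]   = 0;
    d.src = base;
    return d;
}
```

The TiledExecutor loop above builds several such segments per tile (one per valid kernel row crossing the tile), which is why it tracks number, eStart and lOffset instead of memsetting a column buffer.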
mTransformMidBuffer.buffer().type = halide_type_of(); MNN_ASSERT(mCommon->kernelX() == mCommon->kernelY()); int threadNumber = ((CPUBackend *)backend())->threadNumber(); @@ -49,55 +46,46 @@ ConvolutionWinograd::ConvolutionWinograd(const Convolution2DCommon *convOp, cons int alpha = unit + kernelSize - 1; int alpha2 = alpha * alpha; - mSourceTransform = WinogradFunction::chooseSourceTransform(alpha, alpha); - mDestTransform = WinogradFunction::chooseDestTransform(alpha, unit); + mSourceTransform = core->chooseWinoSourceTransform(alpha, alpha); + mDestTransform = core->chooseWinoDestTransform(alpha, unit); int srcCount = input->channel(); int outputCount = output->channel(); - auto ic4 = UP_DIV(srcCount, 4); - auto oc4 = UP_DIV(outputCount, 4); + auto ic4 = UP_DIV(srcCount, pack); + auto oc4 = UP_DIV(outputCount, pack); int ePack, hPack, lPack; - MNNGetMatMulPackMode(&ePack, &lPack, &hPack); - if (hPack % 4 != 0) { - auto hDiv = MNNGetC4DivNumber(hPack); - mCacheBuffer.buffer().dimensions = 2; - mCacheBuffer.buffer().dim[0].extent = threadNumber; - mCacheBuffer.buffer().dim[1].extent = hDiv * ePack * 4 + ePack * 4 * oc4; - TensorUtils::setLinearLayout(&mCacheBuffer); - } else { - mCacheBuffer.buffer().dimensions = 0; - } + core->MNNGetMatMulPackMode(&ePack, &lPack, &hPack); - mTempBuffer.buffer().dim[0].extent = threadNumber; - mTempBuffer.buffer().dim[1].extent = ePack; - mTempBuffer.buffer().dim[2].extent = ic4 + oc4; - mTempBuffer.buffer().dim[3].extent = 4 * alpha2; - TensorUtils::setLinearLayout(&mTempBuffer); + mTempBuffer.reset(Tensor::createDevice({threadNumber, ePack, ic4 + oc4, pack * alpha2, bytes})); + mTransformMidBuffer.reset(Tensor::createDevice({threadNumber, 2, alpha2, pack, bytes})); + mGemmMidBuffer.reset(Tensor::createDevice({threadNumber, ePack * UP_DIV(srcCount, lPack) * lPack, bytes})); - mTransformMidBuffer.buffer().dim[0].extent = threadNumber; - mTransformMidBuffer.buffer().dim[1].extent = 2; - mTransformMidBuffer.buffer().dim[2].extent = alpha2; - mTransformMidBuffer.buffer().dim[3].extent = 4; - TensorUtils::setLinearLayout(&mTransformMidBuffer); - - mGemmMidBuffer.buffer().dim[0].extent = threadNumber; - mGemmMidBuffer.buffer().dim[1].extent = ePack * ic4 * 4; - mGemmMidBuffer.buffer().dimensions = 2; - TensorUtils::setLinearLayout(&mGemmMidBuffer); mA = generator.A(); mB = generator.B(); // Transform Kernel auto G = generator.G(); + // replace Tensor::createDevice by Tensor::create and allocTransformWeight's alloc=true to avoid malloc by onAcquireBuffer std::shared_ptr sourceWeight(Tensor::create( std::vector{outputCount, srcCount, kernelSize, kernelSize}, (void *)originWeight, Tensor::CAFFE)); - mResource->mWeight = generator.allocTransformWeight(sourceWeight.get(), 1, hPack, false); - mValid = backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC); + auto tempWeight = generator.allocTransformWeight(sourceWeight.get(), lPack, hPack, true); + + auto shape = tempWeight->shape(); + shape.push_back(bytes); + mResource->mWeight.reset(Tensor::createDevice(shape)); + mValid = backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC); if (!mValid) { return; } - generator.transformWeight(mResource->mWeight.get(), sourceWeight.get()); + generator.transformWeight(tempWeight.get(), sourceWeight.get(), true); + if (bytes != 4) { + core->MNNFp32ToLowp(tempWeight->host(), mResource->mWeight->host(), tempWeight->elementSize()); + } else { + ::memcpy(mResource->mWeight->host(), tempWeight->host(), tempWeight->size()); + } + + mPostParameters = 
getPostParameters(); } ConvolutionWinograd::~ConvolutionWinograd() { // Do nothing @@ -112,23 +100,26 @@ bool ConvolutionWinograd::onClone(Backend* bn, const Op* op, Execution** dst) { auto dstExe = new ConvolutionWinograd(mResource, op->main_as_Convolution2D()->common(), bn); dstExe->mA = mA; dstExe->mB = mB; - TensorUtils::copyShape(&mCacheBuffer, &(dstExe->mCacheBuffer), true); - TensorUtils::copyShape(&mTempBuffer, &(dstExe->mTempBuffer), true); - TensorUtils::copyShape(&mTransformMidBuffer, &(dstExe->mTransformMidBuffer), true); - TensorUtils::copyShape(&mGemmMidBuffer, &(dstExe->mGemmMidBuffer), true); + dstExe->mTempBuffer.reset(Tensor::createDevice(mTempBuffer->shape())); + dstExe->mTransformMidBuffer.reset(Tensor::createDevice(mTransformMidBuffer->shape())); + dstExe->mGemmMidBuffer.reset(Tensor::createDevice(mGemmMidBuffer->shape())); dstExe->mSourceTransform = mSourceTransform; dstExe->mDestTransform = mDestTransform; + dstExe->mPostParameters = mPostParameters; *dst = dstExe; return true; } ErrorCode ConvolutionWinograd::onExecute(const std::vector &inputs, const std::vector &outputs) { + auto core = static_cast(backend())->functions(); + int pack = core->pack, bytes = core->bytes; + auto input = inputs[0]; auto output = outputs[0]; auto dstUnit = mA->length(1); auto srcUnit = mA->length(0); int ePack, lPack, hPack; - MNNGetMatMulPackMode(&ePack, &lPack, &hPack); + core->MNNGetMatMulPackMode(&ePack, &lPack, &hPack); auto srcUnit2 = srcUnit * srcUnit; @@ -136,8 +127,8 @@ ErrorCode ConvolutionWinograd::onExecute(const std::vector &inputs, co int oh = output->height(); int iw = input->width(); int ih = input->height(); - int ic_4 = UP_DIV(input->channel(), 4); - int dc_4 = UP_DIV(output->channel(), 4); + int ic_4 = UP_DIV(input->channel(), pack); + int dc_4 = UP_DIV(output->channel(), pack); // MNN_PRINT("%d, %d\n", srcUnit, dstUnit); int padY = mPadY; @@ -147,37 +138,35 @@ ErrorCode ConvolutionWinograd::onExecute(const std::vector &inputs, co auto hUnit = UP_DIV(oh, dstUnit); auto totalCount = wUnit * hUnit; - auto postFunction = mPostFunction; // MNN_PRINT("ow=%d, oh=%d\n", ow, oh); int threadNumber = std::max(((CPUBackend *)backend())->threadNumber(), 1); int tileCount = UP_DIV(totalCount, ePack); int eRemain = totalCount % ePack; threadNumber = std::min(threadNumber, tileCount); std::vector parameters(6); - parameters[0] = eRemain * sizeof(float); + parameters[0] = eRemain * bytes; parameters[1] = input->channel(); parameters[2] = output->channel(); - parameters[3] = ePack * 4 * sizeof(float); + parameters[3] = ePack * pack * bytes; parameters[4] = 0; parameters[5] = 0; std::vector parametersRemain = parameters; - parametersRemain[3] = eRemain * 4 * sizeof(float); - + parametersRemain[3] = eRemain * pack * bytes; + auto inputOrigin = input->host(); + auto outputOrigin = output->host(); for (int batchIndex = 0; batchIndex < input->batch(); ++batchIndex) { - auto srcOrigin = input->host() + batchIndex * input->stride(0); - auto dstOrigin = output->host() + batchIndex * output->stride(0); + auto srcOrigin = inputOrigin + batchIndex * ic_4 * iw * ih * pack * bytes; + auto dstOrigin = outputOrigin + batchIndex * dc_4 * ow * oh * pack * bytes; - auto weight = mResource->mWeight->host(); - auto bias = mResource->mBias->host(); + auto weight = mResource->mWeight->host(); + auto bias = mResource->mBias->host(); auto tFunction = [&](int tId) { - auto _srcOrigin = mTempBuffer.host() + tId * mTempBuffer.stride(0); - auto gemmBuffer = mGemmMidBuffer.host() + tId * mGemmMidBuffer.stride(0); 
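Both ConvolutionDepthwise3x3 (header earlier in this patch) and ConvolutionWinograd::onClone above follow the same pattern: packed weights and bias live in a shared Resource, and a clone copies only the shared pointer, re-creating its small per-instance state (post parameters, scratch tensors). A toy illustration of the pattern, not the MNN Execution/onClone API:

```cpp
#include <memory>
#include <vector>

// Heavy, immutable data written once at construction time.
struct Resource {
    std::vector<float> packedWeight;
    std::vector<float> bias;
};

class ConvLikeExecution {
public:
    explicit ConvLikeExecution(std::shared_ptr<Resource> r) : mResource(std::move(r)) {}

    ConvLikeExecution* clone() const {
        auto* copy = new ConvLikeExecution(mResource);  // weights shared, not copied
        copy->mPostParameters = mPostParameters;        // small per-instance state
        return copy;                                    // scratch buffers re-created on resize
    }

private:
    std::shared_ptr<Resource> mResource;
    std::vector<float> mPostParameters;
};
```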
- auto cache = mCacheBuffer.host() + tId * mCacheBuffer.stride(0); - auto midBuffer0 = mTransformMidBuffer.host() + tId * mTransformMidBuffer.stride(0); - auto midBuffer1 = - mTransformMidBuffer.host() + tId * mTransformMidBuffer.stride(0) + mTransformMidBuffer.stride(1); + auto _srcOrigin = mTempBuffer->host() + tId * mTempBuffer->stride(0); + auto gemmBuffer = (float*)(mGemmMidBuffer->host() + tId * mGemmMidBuffer->stride(0)); + auto midBuffer0 = mTransformMidBuffer->host() + tId * mTransformMidBuffer->stride(0); + auto midBuffer1 = midBuffer0 + mTransformMidBuffer->stride(1); for (int tIndex = (int)tId; tIndex < tileCount; tIndex += threadNumber) { int xIndex = (int)tIndex * ePack; int xReamin = totalCount - xIndex; @@ -186,9 +175,9 @@ ErrorCode ConvolutionWinograd::onExecute(const std::vector &inputs, co /*Source Transform Begin*/ #ifndef MNN_WINO_TRANFORM_TEST_CLOSE { - int sourceZStep = iw * ih * 4; - int dstZStep = xC * 4; - int unitStep = ic_4 * xC * 4; + int sourceZStep = iw * ih * pack; + int dstZStep = xC * pack; + int unitStep = ic_4 * xC * pack; int oyBegin = xIndex / wUnit; int oxBegin = xIndex % wUnit; int oyEnd = (xIndex + xC-1) / wUnit; @@ -204,73 +193,96 @@ ErrorCode ConvolutionWinograd::onExecute(const std::vector &inputs, co int srcX = wIndex * dstUnit - padX; int sx = ALIMAX(0, srcX) - srcX; int ex = ALIMIN(srcX + srcUnit, iw) - srcX; - int count = 4 * (ex - sx); - auto dst_x = dstS + 4 * si; - auto srcStart = srcOrigin + (srcX + srcY * iw) * 4; + int count = pack * (ex - sx); + auto dst_x = dstS + si * pack * bytes; + auto srcStart = srcOrigin + (srcX + srcY * iw) * pack * bytes; if (ex - sx == srcUnit && ey - sy == srcUnit) { for (int z = 0; z < ic_4; ++z) { - auto srcZ = srcStart + z * sourceZStep; + auto srcZ = srcStart + z * sourceZStep * bytes; // Transform for (int i = 0; i < srcUnit; ++i) { - mSourceTransform(srcZ + 4 * i * iw, midBuffer1 + 4 * i, 4, 4 * srcUnit); + auto srcFloatPtr = (const float*)(srcZ + i * iw * pack * bytes); + auto dstFloatPtr = (float*)(midBuffer1 + i * pack * bytes); + mSourceTransform(srcFloatPtr, dstFloatPtr, pack, pack * srcUnit); } - auto dstZ = dst_x + z * dstZStep; + auto dstZ = dst_x + z * dstZStep * bytes; for (int i = 0; i < srcUnit; ++i) { - mSourceTransform(midBuffer1 + 4 * i * srcUnit, dstZ + i * unitStep, 4, + auto srcFloatPtr = (const float*)(midBuffer1 + i * srcUnit * pack * bytes); + auto dstFloatPtr = (float*)(dstZ + i * unitStep * bytes); + mSourceTransform(srcFloatPtr, dstFloatPtr, pack, unitStep * srcUnit); } } } else { for (int z = 0; z < ic_4; ++z) { // Extract - auto srcZ = srcStart + z * sourceZStep; - ::memset(midBuffer0, 0, mTransformMidBuffer.stride(1) * sizeof(float)); + auto srcZ = srcStart + z * sourceZStep * bytes; + ::memset(midBuffer0, 0, mTransformMidBuffer->stride(1)); if (count > 0) { for (int yy = sy; yy < ey; ++yy) { - auto dst_yy = midBuffer0 + yy * srcUnit * 4 + sx * 4; - auto src_yy = srcZ + 4 * iw * yy + sx * 4; - ::memcpy(dst_yy, src_yy, count * sizeof(float)); + auto dst_yy = midBuffer0 + (yy * srcUnit + sx) * pack * bytes; + auto src_yy = srcZ + (iw * yy + sx) * pack * bytes; + ::memcpy(dst_yy, src_yy, count * bytes); } } // Transform for (int i = 0; i < srcUnit; ++i) { - mSourceTransform(midBuffer0 + 4 * i * srcUnit, midBuffer1 + 4 * i, 4, 4 * srcUnit); + auto srcFloatPtr = (const float*)(midBuffer0 + i * srcUnit * pack * bytes); + auto dstFloatPtr = (float*)(midBuffer1 + i * pack * bytes); + mSourceTransform(srcFloatPtr, dstFloatPtr, pack, pack * srcUnit); } - auto dstZ = dst_x + z * 
dstZStep; + auto dstZ = dst_x + z * dstZStep * bytes; for (int i = 0; i < srcUnit; ++i) { - mSourceTransform(midBuffer1 + 4 * i * srcUnit, dstZ + i * unitStep, 4, - unitStep * srcUnit); + auto srcFloatPtr = (const float*)(midBuffer1 + i * srcUnit * pack * bytes); + auto dstFloatPtr = (float*)(dstZ + i * unitStep * bytes); + mSourceTransform(srcFloatPtr, dstFloatPtr, pack, unitStep * srcUnit); } } } } oxBegin = 0; remain -= step; - dstS += 4 * step; + dstS += pack * step * bytes; } } /*Source Transform End*/ #endif // Multi - auto _dstOrigin = _srcOrigin + xC * srcUnit2 * ic_4 * 4; + auto _dstOrigin = _srcOrigin + xC * srcUnit2 * ic_4 * pack * bytes; + int32_t info[4]; + info[0] = 1; + info[1] = xC; + info[2] = xC; + info[3] = 1; + int32_t el[4]; + el[0] = xC; + el[1] = parameters[1]; + el[2] = 0; + el[3] = 0; if (xC == ePack) { for (int i = 0; i < srcUnit2; ++i) { - MNNPackC4ForMatMul_A(gemmBuffer, _srcOrigin + i * ic_4 * 4 * xC, ePack, ic_4 * 4, ePack); - MNNPackedMatMul(_dstOrigin + i * dc_4 * 4 * xC, gemmBuffer, weight + i * mResource->mWeight->stride(0), parameters.data(), cache, nullptr, nullptr); + auto srcTemp = (const float*)(_srcOrigin + i * ic_4 * pack * xC * bytes); + auto _dstFloatPtr = (float*)(_dstOrigin + i * dc_4 * pack * xC * bytes); + auto _weightFloatPtr = (const float*)(weight + i * mResource->mWeight->stride(0)); + core->MNNPackC4ForMatMul_A(gemmBuffer, &srcTemp, info, el); + core->MNNPackedMatMul(_dstFloatPtr, gemmBuffer, _weightFloatPtr, parameters.data(), nullptr, nullptr); } } else { for (int i = 0; i < srcUnit2; ++i) { - MNNPackC4ForMatMul_A(gemmBuffer, _srcOrigin + i * ic_4 * 4 * xC, xC, ic_4 * 4, xC); - MNNPackedMatMulRemain(_dstOrigin + i * dc_4 * 4 * xC, gemmBuffer, weight + i * mResource->mWeight->stride(0), xC, parametersRemain.data(), cache, nullptr, nullptr); + auto srcTemp = (const float*)(_srcOrigin + i * ic_4 * pack * xC * bytes); + auto _dstFloatPtr = (float*)(_dstOrigin + i * dc_4 * pack * xC * bytes); + auto _weightFloatPtr = (const float*)(weight + i * mResource->mWeight->stride(0)); + core->MNNPackC4ForMatMul_A(gemmBuffer, &srcTemp, info, el); + core->MNNPackedMatMulRemain(_dstFloatPtr, gemmBuffer, _weightFloatPtr, xC, parametersRemain.data(), nullptr, nullptr); } } #ifndef MNN_WINO_TRANFORM_TEST_CLOSE /* Dest Transform And Post Treat Begin */ { - int dstZStep = ow * oh * 4; - int srcZStep = xC * 4; - int unitStep = dc_4 * xC * 4; + int dstZStep = ow * oh * pack; + int srcZStep = xC * pack; + int unitStep = dc_4 * xC * pack; int oyBegin = xIndex / wUnit; int oxBegin = xIndex % wUnit; int oyEnd = (xIndex + xC-1) / wUnit; @@ -282,49 +294,54 @@ ErrorCode ConvolutionWinograd::onExecute(const std::vector &inputs, co int ey = ALIMIN(dstY + dstUnit, oh) - dstY; for (int si=0; si &inputs, co MNN_CONCURRENCY_BEGIN(tId, threadNumber) { for (int dy=(int)tId; dy < dc_4; dy += threadNumber) { - postFunction(dstOrigin + 4 * ow * oh * dy, bias + 4* dy, ow * oh, 1); + auto dataFloatPtr = (float*)(dstOrigin + ow * oh * dy * pack * bytes); + auto biasFloatPtr = (const float*)(bias + pack * dy * bytes); + core->MNNAxByClampBroadcastUnit(dataFloatPtr, dataFloatPtr, biasFloatPtr, ow * oh, 0, 0, 1, mPostParameters.data()); } } MNN_CONCURRENCY_END(); @@ -349,12 +368,13 @@ ErrorCode ConvolutionWinograd::onExecute(const std::vector &inputs, co } int ConvolutionWinograd::bestWinogradUnit(const Convolution2DCommon *common, const Tensor *inputTensor, - const Tensor *outputTensor, int threadNumber) { + const Tensor *outputTensor, int threadNumber, Backend* b) { + auto core = 
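The GEMM calls above (and in StrassenMatmulComputor further down) pass a six-slot size_t parameter block to MNNPackedMatMul / MNNPackedMatMulRemain. The sketch below collects how this patch fills it; the slot meanings are inferred from these call sites, and slot 4 is always written as 0 here, so it is left uninterpreted:

```cpp
#include <cstddef>
#include <vector>

// Builds the parameter block as the Winograd, Tiled and Strassen call sites do.
static std::vector<size_t> makeGemmParameters(size_t eBytes, size_t l, size_t h,
                                              size_t cStrideBytes,
                                              size_t bExtraStrideBytes) {
    std::vector<size_t> p(6);
    p[0] = eBytes;            // e count of the tile, pre-multiplied by core->bytes
    p[1] = l;                 // reduce dimension (input channels * kernel area)
    p[2] = h;                 // output channels
    p[3] = cStrideBytes;      // byte stride between packed output-channel planes
    p[4] = 0;                 // unused (always 0 in this patch)
    p[5] = bExtraStrideBytes; // extra byte stride between packed B blocks (Strassen sub-views)
    return p;
}
```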
static_cast(b)->functions(); int ow = outputTensor->width(); int oh = outputTensor->height(); int oc = outputTensor->channel(); int ePack, hPack, lPack; - MNNGetMatMulPackMode(&ePack, &lPack, &hPack); + core->MNNGetMatMulPackMode(&ePack, &lPack, &hPack); int unit2 = UP_DIV(ow * oh, ePack * threadNumber); int maxUnit = (int)::sqrtf((float)unit2); maxUnit = std::min(maxUnit, CONVOLUTION_WINOGRAD_MAX_UNIT); @@ -365,14 +385,14 @@ int ConvolutionWinograd::bestWinogradUnit(const Convolution2DCommon *common, con int unit = 0; float maxRate = 0.0f; float originCost = (float)ow * oh * (float)ic * oc * kernelSize * kernelSize; - static std::set supportSu{4, 6, 8}; + std::set supportSu{4, 6, 8}; for (int u = CONVOLUTION_WINOGRAD_MIN_UNIT; u <= maxUnit; ++u) { auto sui = u + kernelSize - 1; auto su = (float)sui; if (supportSu.find(sui) == supportSu.end()) { continue; } - if (nullptr == WinogradFunction::chooseDestTransform((int)su, u)) { + if (nullptr == core->chooseWinoDestTransform((int)su, u)) { continue; } /*Let F(6,3) be choosed when it can speed up from F(2,3) than 0.6*/ @@ -408,18 +428,12 @@ bool ConvolutionWinograd::canUseWinograd(const Convolution2DCommon *common) { ErrorCode ConvolutionWinograd::onResize(const std::vector &inputs, const std::vector &outputs) { CPUConvolution::onResize(inputs, outputs); // FUNC_PRINT(mA->length(1)); - bool success = backend()->onAcquireBuffer(&mTempBuffer, Backend::DYNAMIC); - success = success && backend()->onAcquireBuffer(&mGemmMidBuffer, Backend::DYNAMIC); - success = success && (backend()->onAcquireBuffer(&mTransformMidBuffer, Backend::DYNAMIC)); - if (mCacheBuffer.buffer().dimensions > 0) { - success = success && backend()->onAcquireBuffer(&mCacheBuffer, Backend::DYNAMIC); - } - backend()->onReleaseBuffer(&mTempBuffer, Backend::DYNAMIC); - backend()->onReleaseBuffer(&mTransformMidBuffer, Backend::DYNAMIC); - backend()->onReleaseBuffer(&mGemmMidBuffer, Backend::DYNAMIC); - if (mCacheBuffer.buffer().dimensions > 0) { - backend()->onReleaseBuffer(&mCacheBuffer, Backend::DYNAMIC); - } + bool success = backend()->onAcquireBuffer(mTempBuffer.get(), Backend::DYNAMIC); + success = success && backend()->onAcquireBuffer(mGemmMidBuffer.get(), Backend::DYNAMIC); + success = success && (backend()->onAcquireBuffer(mTransformMidBuffer.get(), Backend::DYNAMIC)); + backend()->onReleaseBuffer(mTempBuffer.get(), Backend::DYNAMIC); + backend()->onReleaseBuffer(mTransformMidBuffer.get(), Backend::DYNAMIC); + backend()->onReleaseBuffer(mGemmMidBuffer.get(), Backend::DYNAMIC); if (!success) { return OUT_OF_MEMORY; } diff --git a/source/backend/cpu/compute/ConvolutionWinograd.hpp b/source/backend/cpu/compute/ConvolutionWinograd.hpp index 0bc09e3a..8075446f 100644 --- a/source/backend/cpu/compute/ConvolutionWinograd.hpp +++ b/source/backend/cpu/compute/ConvolutionWinograd.hpp @@ -11,7 +11,7 @@ #include "backend/cpu/CPUConvolution.hpp" #include "backend/cpu/compute/ConvolutionFloatFactory.h" -#include "backend/cpu/compute/WinogradOptFunction.hpp" +#include "backend/cpu/compute/CommonOptFunction.h" namespace MNN { class ConvolutionWinograd : public CPUConvolution { @@ -25,7 +25,7 @@ public: static bool canUseWinograd(const Convolution2DCommon *convOp); static int bestWinogradUnit(const Convolution2DCommon *convOp, const Tensor *input, const Tensor *output, - int threadnumber); + int threadnumber, Backend* b); virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override; private: ConvolutionWinograd(std::shared_ptr resource, const Convolution2DCommon *convOp, Backend* 
b) : CPUConvolution(convOp, b) { @@ -35,13 +35,13 @@ private: std::shared_ptr mA; std::shared_ptr mB; - Tensor mTempBuffer; - Tensor mTransformMidBuffer; - Tensor mGemmMidBuffer; - Tensor mCacheBuffer; + std::shared_ptr mTempBuffer; + std::shared_ptr mTransformMidBuffer; + std::shared_ptr mGemmMidBuffer; - WinogradFunction::TransformFunc mSourceTransform; - WinogradFunction::TransformFunc mDestTransform; + CoreFunctions::WinoTransFunc mSourceTransform; + CoreFunctions::WinoTransFunc mDestTransform; + std::vector mPostParameters; }; } // namespace MNN #endif /* ConvolutionWinograd_hpp */ diff --git a/source/backend/cpu/compute/DeconvolutionWithStride.cpp b/source/backend/cpu/compute/DeconvolutionWithStride.cpp index 0ff6f6dd..19e53780 100644 --- a/source/backend/cpu/compute/DeconvolutionWithStride.cpp +++ b/source/backend/cpu/compute/DeconvolutionWithStride.cpp @@ -8,9 +8,9 @@ #include "backend/cpu/compute/DeconvolutionWithStride.hpp" #include "backend/cpu/CPUBackend.hpp" -#include "backend/cpu/compute/CommonOptFunction.h" +#include "CommonOptFunction.h" #include "core/Concurrency.h" -#include "backend/cpu/compute/ConvOpt.h" +#include "ConvOpt.h" #include "core/Macro.h" #include "math/WingoradGenerater.hpp" #include "backend/cpu/compute/WinogradOptFunction.hpp" @@ -28,18 +28,19 @@ namespace MNN { static const int gDefaultUnit = 3; static void _winograd(const DeconvolutionWithStride::ComputeUnit& unit, int threadId, int strideX, int strideY, const Tensor* src, const Tensor* dst, std::map>& sourceTransformMap, - std::map& sourceTransformed) { - auto CONVOLUTION_TILED_NUMBER = MNNGetConvolutionTileNumber(); + std::map& sourceTransformed, float* cachePackBuffer, int ic, int oc) { + int eP, lP, hP; + MNNGetMatMulPackMode(&eP, &lP, &hP); auto srcUnit = unit.winogradInfo.srcUnitX; auto buffer = sourceTransformMap[srcUnit]; // We allocated the buffer with 2*numberThread int numberThread = buffer->length(0) / 2; auto dstUnit = gDefaultUnit; - int dc_4 = dst->length(3) / 4 / CONVOLUTION_TILED_NUMBER; + int dc_4 = dst->length(3) / 4 / eP; int srcCount = src->stride(2); int totalCount = dst->stride(2); - int ic_4 = srcCount / CONVOLUTION_TILED_NUMBER / 4; + int ic_4 = srcCount / eP / 4; auto dstTotal = dst->host() + threadId * dst->stride(0); auto srcTotal = src->host() + threadId * src->stride(0); @@ -49,28 +50,47 @@ static void _winograd(const DeconvolutionWithStride::ComputeUnit& unit, int thre auto destAddr = buffer->host() + (threadId)*buffer->stride(0); WinogradFunction::productLeft(srcTotal, A->host(), midAddr, dstUnit, srcUnit, dstUnit, - ic_4 * CONVOLUTION_TILED_NUMBER); + ic_4 * eP); WinogradFunction::productRight(midAddr, A->host(), destAddr, srcUnit, srcUnit, dstUnit, - ic_4 * CONVOLUTION_TILED_NUMBER); + ic_4 * eP); sourceTransformed[srcUnit] = true; } auto sourceAddr = buffer->host() + (threadId)*buffer->stride(0); auto destAddr = unit.dstBuffer->host() + threadId * unit.dstBuffer->stride(0); + int32_t info[4]; + info[0] = 1; + info[1] = eP; + info[2] = eP; + info[3] = 1; + int32_t el[4]; + el[0] = eP; + el[1] = ic; + el[2] = 0; + el[3] = 0; + size_t parameters[6]; + parameters[0] = eP * sizeof(float); + parameters[1] = ic; + parameters[2] = oc; + parameters[3] = eP * 4 * sizeof(float); + parameters[4] = 0; + parameters[5] = 0; + for (int i = 0; i < srcUnit * srcUnit; ++i) { - auto tempSourceAddr = sourceAddr + i * buffer->stride(2); + const float* tempSourceAddr = sourceAddr + i * buffer->stride(2); auto tempColAddr = destAddr + i * unit.dstBuffer->stride(1); auto weightAddr = 
unit.weight->host() + unit.weight->stride(0) * i; - MNNGemmFloatUnit_4(tempColAddr, tempSourceAddr, weightAddr, ic_4, CONVOLUTION_TILED_NUMBER * 4, dc_4, 0); + MNNPackC4ForMatMul_A(cachePackBuffer, &tempSourceAddr, info, el); + MNNPackedMatMul(tempColAddr, cachePackBuffer,weightAddr, parameters, nullptr, nullptr); } auto B = unit.winogradInfo.B.get(); auto midAddr = unit.winogradInfo.dstTransformedBuffer->host() + threadId * unit.winogradInfo.dstTransformedBuffer->stride(0); WinogradFunction::productLeft(destAddr, B->host(), midAddr, srcUnit, srcUnit, srcUnit, - dc_4 * CONVOLUTION_TILED_NUMBER); + dc_4 * eP); WinogradFunction::productRight(midAddr, B->host(), destAddr, srcUnit, srcUnit, srcUnit, - dc_4 * CONVOLUTION_TILED_NUMBER); + dc_4 * eP); // Add to dest for (int fy = 0; fy < srcUnit; ++fy) { @@ -85,21 +105,48 @@ static void _winograd(const DeconvolutionWithStride::ComputeUnit& unit, int thre } static void _gemmAndIm2col(const DeconvolutionWithStride::ComputeUnit& unit, int threadId, int strideX, int strideY, - const Tensor* src, const Tensor* dst) { + const Tensor* src, const Tensor* dst, float* cachePackBuffer, int ic, int oc) { auto tempColAddr = unit.dstBuffer->host() + unit.dstBuffer->stride(0) * threadId; - auto CONVOLUTION_TILED_NUMBER = MNNGetConvolutionTileNumber(); - int ocDiv4 = dst->length(3) / 4 / CONVOLUTION_TILED_NUMBER; + int eP, lP, hP; + MNNGetMatMulPackMode(&eP, &lP, &hP); + int ocDiv4 = dst->length(3) / 4 / eP; int count = ocDiv4 * unit.xUnit * unit.yUnit; auto weightAddr = unit.weight->host(); auto dstTotal = dst->host() + threadId * dst->stride(0); auto srcTotal = src->host() + threadId * src->stride(0); int srcCount = src->stride(2); int totalCount = dst->stride(2); - int icDiv4 = srcCount / CONVOLUTION_TILED_NUMBER / 4; + int ic_4 = srcCount / eP / 4; + int dc_4 = ocDiv4; + int32_t info[4]; + info[0] = 1; + info[1] = eP; + info[2] = eP; + info[3] = 1; + int32_t el[4]; + el[0] = eP; + el[1] = ic; + el[2] = 0; + el[3] = 0; + size_t parameters[6]; + parameters[0] = eP * sizeof(float); + parameters[1] = ic; + parameters[2] = oc; + parameters[3] = eP * 4 * sizeof(float); + parameters[4] = 0; + parameters[5] = 0; + for (int dy = 0; dy < gDefaultUnit; ++dy) { for (int dx = 0; dx < gDefaultUnit; ++dx) { - auto tempSourceAddr = srcTotal + (dx + dy * gDefaultUnit) * srcCount; - MNNGemmFloatUnit_4(tempColAddr, tempSourceAddr, weightAddr, icDiv4, CONVOLUTION_TILED_NUMBER * 4, count, 0); + const float* tempSourceAddr = srcTotal + (dx + dy * gDefaultUnit) * srcCount; + MNNPackC4ForMatMul_A(cachePackBuffer, &tempSourceAddr, info, el); + for (int fy = 0; fy < unit.yUnit; ++fy) { + for (int fx = 0; fx < unit.xUnit; ++fx) { + auto ucolAddr = tempColAddr + dc_4 * eP * 4 * (fx + fy * unit.xUnit); + auto uwAddr = weightAddr + unit.weight->stride(0) * (fx + fy * unit.xUnit); + MNNPackedMatMul(ucolAddr, cachePackBuffer, uwAddr, parameters, nullptr, nullptr); + } + } // FUNC_PRINT_ALL(tempColAddr[0], f); for (int fy = 0; fy < unit.yUnit; ++fy) { @@ -123,7 +170,9 @@ DeconvolutionWithStride::DeconvolutionWithStride(const Tensor* input, const Op* int outputCount = common->outputCount(); int kx = common->kernelX(); int ky = common->kernelY(); - + int eP, lP, hP; + MNNGetMatMulPackMode(&eP, &lP, &hP); + const float* tempWeight = nullptr; int tempWeightSize = 0; int srcCount = 0; @@ -171,12 +220,12 @@ DeconvolutionWithStride::DeconvolutionWithStride(const Tensor* input, const Op* unit.winogradInfo.G = generater.G(); unit.weight.reset(Tensor::createDevice( - std::vector{sourceUnitX * 
sourceUnitY, UP_DIV(outputCount, 4), UP_DIV(srcCount, 4), 16})); + std::vector{sourceUnitX * sourceUnitY, UP_DIV(outputCount, hP), UP_DIV(srcCount, lP), lP * hP})); } else #endif { unit.weight.reset(Tensor::createDevice( - std::vector{unit.yUnit * unit.xUnit, UP_DIV(outputCount, 4), UP_DIV(srcCount, 4), 16})); + std::vector{unit.yUnit * unit.xUnit, UP_DIV(outputCount, hP), UP_DIV(srcCount, lP), lP * hP})); } mComputeUnits.emplace_back(unit); } @@ -188,6 +237,7 @@ DeconvolutionWithStride::DeconvolutionWithStride(const Tensor* input, const Op* return; } _extract(convOp); + mPostParameters = getPostParameters(); } bool DeconvolutionWithStride::_alloc(Backend::StorageType type) { @@ -213,6 +263,8 @@ void DeconvolutionWithStride::_extract(const Op* convOp) { int outputCount = common->outputCount(); int kx = common->kernelX(); int ky = common->kernelY(); + int eP, lP, hP; + MNNGetMatMulPackMode(&eP, &lP, &hP); const float* tempWeight = nullptr; int tempWeightSize = 0; @@ -286,22 +338,21 @@ void DeconvolutionWithStride::_extract(const Op* convOp) { auto weighStrideK = unit.weight->stride(0); ::memset(unit.weight->host(), 0, unit.weight->size()); for (int sz = 0; sz < srcCount; ++sz) { - int sz4 = sz / 4; - int my = sz % 4; - auto dstS = unit.weight->host() + 16 * sz4; + int sz4 = sz / lP; + int my = sz % lP; + auto dstS = unit.weight->host() + hP * lP * sz4; for (int oz = 0; oz < outputCount; ++oz) { - int oz4 = oz / 4; - int mx = oz % 4; + int oz4 = oz / hP; + int mx = oz % hP; auto dstO = dstS + unit.weight->stride(1) * oz4; auto src = tempWeight->host() + tempWeight->stride(0) * sz + tempWeight->stride(1) * oz; for (int fy = 0; fy < subKy; ++fy) { for (int fx = 0; fx < subKx; ++fx) { - dstO[weighStrideK * (fy * subKx + fx) + 4 * my + mx] = src[fy * subKx + fx]; + dstO[weighStrideK * (fy * subKx + fx) + my + lP * mx] = src[fy * subKx + fx]; } } } } - MNNReorder4x4ByPlatform(unit.weight->host(), unit.weight->elementSize() / 16); } } @@ -316,40 +367,43 @@ ErrorCode DeconvolutionWithStride::onResize(const std::vector& inputs, auto ic = input->channel(); auto oc = output->channel(); - auto CONVOLUTION_TILED_NUMBER = MNNGetConvolutionTileNumber(); + int eP, lP, hP; + MNNGetMatMulPackMode(&eP, &lP, &hP); int numThread = std::max(1, ((CPUBackend*)backend())->threadNumber()); mSrcBuffer.reset(Tensor::createDevice( - std::vector{numThread, gDefaultUnit, gDefaultUnit, CONVOLUTION_TILED_NUMBER * ALIGN_UP4(ic)})); + std::vector{numThread, gDefaultUnit, gDefaultUnit, eP * ALIGN_UP4(ic)})); int dstXUnit = (gDefaultUnit - 1) * mCommon->strideX() + (mCommon->kernelX() - 1) * mCommon->dilateX() + 1; int dstYUnit = (gDefaultUnit - 1) * mCommon->strideY() + (mCommon->kernelY() - 1) * mCommon->dilateY() + 1; + mMatMulPackBuffer.reset(Tensor::createDevice(std::vector{numThread, eP * ALIGN_UP4(ic)})); mDestBuffer.reset(Tensor::createDevice( - std::vector{numThread, dstYUnit, dstXUnit, CONVOLUTION_TILED_NUMBER * ALIGN_UP4(oc)})); + std::vector{numThread, dstYUnit, dstXUnit, eP * ALIGN_UP4(oc)})); bool res = backend()->onAcquireBuffer(mSrcBuffer.get(), Backend::DYNAMIC); res &= backend()->onAcquireBuffer(mDestBuffer.get(), Backend::DYNAMIC); + res &= backend()->onAcquireBuffer(mMatMulPackBuffer.get(), Backend::DYNAMIC); mTransformedBuffer.clear(); for (auto& unit : mComputeUnits) { auto kxky = unit.yUnit * unit.xUnit; if (!unit.winogradInfo.open) { unit.dstBuffer.reset(Tensor::createDevice( - std::vector{numThread, UP_DIV(oc, 4) * kxky, CONVOLUTION_TILED_NUMBER, 4})); + std::vector{numThread, UP_DIV(oc, 4) * kxky, eP, 
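The _extract loop above regroups deconvolution weights from the old per-channel 4x4 blocks into lP x hP blocks that the packed GEMM reads linearly, which is why the trailing MNNReorder4x4ByPlatform pass is removed. A standalone restatement of the index arithmetic; blockStride and kStride stand in for unit.weight->stride(1) and weighStrideK:

```cpp
#include <cstddef>

// Linear element index of weight (sz, oz) at kernel position k inside the
// repacked buffer laid out as {kernel pos, h blocks, l blocks, lP * hP}.
static size_t packedWeightIndex(int sz, int oz, int k,   // src channel, dst channel, kernel pos
                                int lP, int hP,
                                size_t blockStride, size_t kStride) {
    const int sz4 = sz / lP, my = sz % lP;   // which l block, offset inside it
    const int oz4 = oz / hP, mx = oz % hP;   // which h block, offset inside it
    return (size_t)hP * lP * sz4             // start of the l block (dstS in the diff)
         + blockStride * (size_t)oz4         // start of the h block (dstO)
         + kStride * (size_t)k               // kernel position
         + (size_t)my + (size_t)lP * mx;     // element inside the lP x hP block
}
```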
4})); res &= backend()->onAcquireBuffer(unit.dstBuffer.get(), Backend::DYNAMIC); continue; } auto srcUnit = unit.winogradInfo.srcUnitX; unit.dstBuffer.reset(Tensor::createDevice( - std::vector{numThread, srcUnit * srcUnit, UP_DIV(oc, 4), CONVOLUTION_TILED_NUMBER * 4})); + std::vector{numThread, srcUnit * srcUnit, UP_DIV(oc, 4), eP * 4})); res &= backend()->onAcquireBuffer(unit.dstBuffer.get(), Backend::DYNAMIC); unit.winogradInfo.dstTransformedBuffer.reset(Tensor::createDevice( - std::vector{numThread, srcUnit * srcUnit, UP_DIV(oc, 4), CONVOLUTION_TILED_NUMBER * 4})); + std::vector{numThread, srcUnit * srcUnit, UP_DIV(oc, 4), eP * 4})); res &= backend()->onAcquireBuffer(unit.winogradInfo.dstTransformedBuffer.get(), Backend::DYNAMIC); if (mTransformedBuffer.find(srcUnit) == mTransformedBuffer.end()) { // We Need 2 buffer for transform, one for mid buffer and one for dest std::shared_ptr transformBuffer = std::shared_ptr(Tensor::createDevice( - std::vector{2 * numThread, srcUnit, srcUnit, CONVOLUTION_TILED_NUMBER * ALIGN_UP4(ic)})); + std::vector{2 * numThread, srcUnit, srcUnit, eP * ALIGN_UP4(ic)})); mTransformedBuffer[srcUnit] = transformBuffer; } } @@ -368,6 +422,7 @@ ErrorCode DeconvolutionWithStride::onResize(const std::vector& inputs, } backend()->onReleaseBuffer(mSrcBuffer.get(), Backend::DYNAMIC); backend()->onReleaseBuffer(mDestBuffer.get(), Backend::DYNAMIC); + backend()->onReleaseBuffer(mMatMulPackBuffer.get(), Backend::DYNAMIC); for (auto& iter : mTransformedBuffer) { backend()->onReleaseBuffer(iter.second.get(), Backend::DYNAMIC); @@ -398,15 +453,15 @@ ErrorCode DeconvolutionWithStride::onExecute(const std::vector& inputs, int strideX = mStrideX; int strideY = mStrideY; - auto CONVOLUTION_TILED_NUMBER = MNNGetConvolutionTileNumber(); + int eP, lP, hP; + MNNGetMatMulPackMode(&eP, &lP, &hP); - auto postFunction = mPostFunction; // FUNC_PRINT(mPadX); // FUNC_PRINT(mPadY); int wUnit = UP_DIV(iw, gDefaultUnit); int hUnit = UP_DIV(ih, gDefaultUnit); - int tileCount = UP_DIV(wUnit * hUnit, CONVOLUTION_TILED_NUMBER); + int tileCount = UP_DIV(wUnit * hUnit, eP); int numThread = std::max(1, ((CPUBackend*)backend())->threadNumber()); numThread = std::min(numThread, tileCount); @@ -418,12 +473,13 @@ ErrorCode DeconvolutionWithStride::onExecute(const std::vector& inputs, auto threadFunction = [&](int threadId) { auto srcTotal = mSrcBuffer->host() + threadId * mSrcBuffer->stride(0); auto dstTotal = mDestBuffer->host() + threadId * mDestBuffer->stride(0); + auto packBuffer = mMatMulPackBuffer->host() + threadId * mMatMulPackBuffer->stride(0); for (int tIndex = (int)threadId; tIndex < tileCount; tIndex += numThread) { // Move Source to tile Source - int xIndex = tIndex * CONVOLUTION_TILED_NUMBER; - int xCount = std::min(CONVOLUTION_TILED_NUMBER, wUnit * hUnit - xIndex); + int xIndex = tIndex * eP; + int xCount = std::min(eP, wUnit * hUnit - xIndex); { - int destUnitStride = icDiv4 * CONVOLUTION_TILED_NUMBER * 4; + int destUnitStride = icDiv4 * eP * 4; for (int index = 0; index < xCount; ++index) { int whIndex = xIndex + index; int wIndex = whIndex % wUnit; @@ -444,17 +500,17 @@ ErrorCode DeconvolutionWithStride::onExecute(const std::vector& inputs, #endif for (int z = 0; z < icDiv4; ++z) { #ifdef MNN_USE_NEON - vst1q_f32(dstUnit + 4 * CONVOLUTION_TILED_NUMBER * z, zero); + vst1q_f32(dstUnit + 4 * eP * z, zero); #else for (int j = 0; j < 4; ++j) { - dstUnit[4 * CONVOLUTION_TILED_NUMBER * z + j] = 0; + dstUnit[4 * eP * z + j] = 0; } #endif } continue; } auto srcUnit = srcStart + (subX + subY * 
iw) * 4; - MNNCopyC4WithStride(srcUnit, dstUnit, iZstep, CONVOLUTION_TILED_NUMBER * 4, icDiv4); + MNNCopyC4WithStride(srcUnit, dstUnit, iZstep, eP * 4, icDiv4); } } } @@ -469,20 +525,20 @@ ErrorCode DeconvolutionWithStride::onExecute(const std::vector& inputs, for (auto& unit : mComputeUnits) { if (unit.winogradInfo.open) { _winograd(unit, (int)threadId, strideX, strideY, mSrcBuffer.get(), mDestBuffer.get(), - mTransformedBuffer, transformed); + mTransformedBuffer, transformed, packBuffer, ic, oc); } else { - _gemmAndIm2col(unit, (int)threadId, strideX, strideY, mSrcBuffer.get(), mDestBuffer.get()); + _gemmAndIm2col(unit, (int)threadId, strideX, strideY, mSrcBuffer.get(), mDestBuffer.get(), packBuffer, ic, oc); } } // Merge to Dest { std::unique_lock __l(mLock); - int srcUnitStride = ocDiv4 * CONVOLUTION_TILED_NUMBER * 4; + int srcUnitStride = ocDiv4 * eP * 4; int destXUnit = mDestBuffer->length(2); int destYUnit = mDestBuffer->length(1); for (int index = 0; index < xCount; ++index) { - int whIndex = tIndex * CONVOLUTION_TILED_NUMBER + index; + int whIndex = tIndex * eP + index; int wIndex = whIndex % wUnit; int hIndex = whIndex / wUnit; @@ -500,7 +556,7 @@ ErrorCode DeconvolutionWithStride::onExecute(const std::vector& inputs, for (int subX = xStart; subX < xEnd; ++subX) { auto srcUnit = srcStart + (subX + subY * destXUnit) * srcUnitStride; auto dstUnit = dstStart + (subX + subY * ow) * 4; - MNNAddC4WithStride(srcUnit, dstUnit, 4 * CONVOLUTION_TILED_NUMBER, oZstep, ocDiv4); + MNNAddC4WithStride(srcUnit, dstUnit, 4 * eP, oZstep, ocDiv4); } } } @@ -512,7 +568,7 @@ ErrorCode DeconvolutionWithStride::onExecute(const std::vector& inputs, threadFunction((int)threadId); } MNN_CONCURRENCY_END(); - postFunction(dstOrigin, mBias->host(), ow * oh, ocDiv4); + MNNAxByClampBroadcastUnit(dstOrigin, dstOrigin, mBias->host(), ow * oh, ow * oh * 4, ow * oh * 4, ocDiv4, mPostParameters.data()); } return NO_ERROR; diff --git a/source/backend/cpu/compute/DeconvolutionWithStride.hpp b/source/backend/cpu/compute/DeconvolutionWithStride.hpp index 987f2d13..ed96e93d 100644 --- a/source/backend/cpu/compute/DeconvolutionWithStride.hpp +++ b/source/backend/cpu/compute/DeconvolutionWithStride.hpp @@ -50,6 +50,7 @@ private: void _extract(const Op *convOp); std::shared_ptr mSrcBuffer; + std::shared_ptr mMatMulPackBuffer; std::map> mTransformedBuffer; std::shared_ptr mDestBuffer; @@ -58,6 +59,7 @@ private: std::mutex mLock; int mStrideX = 1; int mStrideY = 1; + std::vector mPostParameters; }; } // namespace MNN diff --git a/source/backend/cpu/compute/Int8FunctionsOpt.cpp b/source/backend/cpu/compute/Int8FunctionsOpt.cpp index befb4ffc..8225088f 100644 --- a/source/backend/cpu/compute/Int8FunctionsOpt.cpp +++ b/source/backend/cpu/compute/Int8FunctionsOpt.cpp @@ -172,7 +172,7 @@ void MNNConvRunForUnitDepthWiseInt8(float* dst, const int8_t* src, const int8_t* } } -#if defined(__aarch64__) && defined(ENABLE_ARMV82) +#if defined(ENABLE_ARMV82) && (defined(__ANDROID__) || defined(__aarch64__)) inline int8_t MNNInt32ToInt8T(int data, int bias, float scale) { float value = (float)(data + bias) * scale; diff --git a/source/backend/cpu/compute/Int8FunctionsOpt.h b/source/backend/cpu/compute/Int8FunctionsOpt.h index 838add46..35aae2bb 100644 --- a/source/backend/cpu/compute/Int8FunctionsOpt.h +++ b/source/backend/cpu/compute/Int8FunctionsOpt.h @@ -72,7 +72,7 @@ struct QuanPostTreatParameters { void MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t 
dst_depth_quad, const QuanPostTreatParameters* post, size_t realCount); void MNNGemmInt8AddBiasScale_16x4_Unit_FAST(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realCount); -#if defined(__aarch64__) && defined(ENABLE_ARMV82) +#if defined(ENABLE_ARMV82) && (defined(__ANDROID__) || defined(__aarch64__)) void MNNGemmInt8AddBiasScale_ARMV82_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, size_t realDstCount, const QuanPostTreatParameters* parameters); // default TILE size #define DST_XUNIT_ARMV82 16 diff --git a/source/backend/cpu/compute/StrassenMatmulComputor.cpp b/source/backend/cpu/compute/StrassenMatmulComputor.cpp index d7192987..5a3bbad6 100644 --- a/source/backend/cpu/compute/StrassenMatmulComputor.cpp +++ b/source/backend/cpu/compute/StrassenMatmulComputor.cpp @@ -7,53 +7,20 @@ // #include "StrassenMatmulComputor.hpp" +#include "CommonOptFunction.h" #include "backend/cpu/CPUBackend.hpp" #include -#include "ConvOpt.h" #include -#include "CommonOptFunction.h" +#include "core/AutoStorage.h" #include "core/Macro.h" #include "core/Concurrency.h" //#define MNN_OPEN_TIME_TRACE #include #include "math/Vec.hpp" #include "math/Matrix.hpp" -using Vec4 = MNN::Math::Vec; -extern "C" { -void MNNStrassenMergeCFunction(float* c11, float* c12, float* c21, float* c22, float* xAddr, size_t cStride, - size_t eSub, size_t hSub); -} - -#ifndef MNN_USE_NEON -void MNNStrassenMergeCFunction(float* c11, float* c12, float* c21, float* c22, float* xAddr, size_t cStride, - size_t eSub, size_t hSub) { - for (int y=0; y PTensor; +typedef AutoRelease PTensor; class StrassenMatrixComputor::AddTensor { public: AddTensor(Tensor* t, Backend* bn, Backend::StorageType storageType = Backend::DYNAMIC) { @@ -77,7 +44,7 @@ public: } private: - std::shared_ptr mTensor; + AutoRelease mTensor; Backend* mBackend; bool mValid = false; Backend::StorageType mStorageType; @@ -94,37 +61,34 @@ ErrorCode StrassenMatrixComputor::_generateTrivalMatMul(const Tensor* AT, const // Generate Trival Matrix Multiply auto e = AT->length(1); MNN_ASSERT(e > 0); - auto aHost = AT->host(); - auto bHost = BT->host(); - auto cHost = CT->host(); + auto core = static_cast(backend())->functions(); + int bytes = core->bytes; + auto packedA = core->MNNPackC4ForMatMul_A; + auto matmul = core->MNNPackedMatMul; + auto matmulr = core->MNNPackedMatMulRemain; + auto aHost = AT->host(); + auto bHost = BT->host(); + auto cHost = CT->host(); auto aStride = AT->stride(0); auto bStride = BT->stride(0); auto cStride = CT->stride(0); int eP, lP, hP; - MNNGetMatMulPackMode(&eP, &lP, &hP); + core->MNNGetMatMulPackMode(&eP, &lP, &hP); + auto l = BT->length(1); auto numberThread = mSupportMultiThread ? 
((CPUBackend*)backend())->threadNumber() : 1; - auto CONVOLUTION_TILED_NUMBER = eP; auto bExtraStride = bStride - BT->length(1) * BT->length(2); - AddTensor tileBuffer(Tensor::createDevice(std::vector{numberThread, BT->length(1), CONVOLUTION_TILED_NUMBER}), backend()); - std::vector cachePtr(numberThread, nullptr); - if (hP % 4 != 0) { - auto hDiv = MNNGetC4DivNumber(hP); - AddTensor matmulTempBuffer(Tensor::createDevice(std::vector{numberThread, eP * hDiv * 4 + CT->length(0) * eP * 4}), backend()); - for (int i=0; ihost() + i * matmulTempBuffer->stride(0); - } - } - auto tileHostOrigin = tileBuffer->host(); - int unitNumber = e / CONVOLUTION_TILED_NUMBER; - int xCount = e - unitNumber * CONVOLUTION_TILED_NUMBER; + AddTensor tileBuffer(Tensor::createDevice(std::vector{numberThread, UP_DIV(l, lP) * eP * lP * bytes}), backend()); + auto tileHostOrigin = tileBuffer->host(); + int unitNumber = e / eP; + int xCount = e - unitNumber * eP; std::vector parameters(6); - auto hMin = std::min(CT->length(0) * 4, BT->length(0) * hP); - parameters[0] = xCount * sizeof(float); - parameters[1] = BT->length(1); + auto hMin = std::min(CT->length(0) * core->pack, BT->length(0) * hP); + parameters[0] = xCount * bytes; + parameters[1] = l; parameters[2] = hMin; - parameters[3] = cStride * sizeof(float); + parameters[3] = cStride * bytes; parameters[4] = 0; - parameters[5] = bExtraStride * sizeof(float); + parameters[5] = bExtraStride * bytes; auto eReal = aStride / AT->length(2); const float* biasPtr = nullptr; if (nullptr != COT) { @@ -134,41 +98,55 @@ ErrorCode StrassenMatrixComputor::_generateTrivalMatMul(const Tensor* AT, const } mFunctions.emplace_back( - std::make_pair([xCount, aHost, bHost, cHost, tileHostOrigin, unitNumber, bExtraStride, numberThread, parameters, eReal, CONVOLUTION_TILED_NUMBER, cachePtr, biasPtr, active](int tId) { - auto tileHost = tileHostOrigin + CONVOLUTION_TILED_NUMBER * parameters[1] * tId; + std::make_pair([xCount, aHost, bHost, cHost, tileHostOrigin, unitNumber, bExtraStride, numberThread, parameters, eReal, eP, biasPtr, active, packedA, matmul, matmulr, core](int tId) { + auto tileHost = tileHostOrigin + eP * parameters[1] * tId * core->bytes; const float* postParametersPtr = nullptr; if (!active.empty()) { postParametersPtr = active.data(); } - auto cache = cachePtr[tId]; + auto packUnit = core->bytes * core->pack; + int32_t info[4]; + int32_t stride[4]; + stride[0] = eP; + stride[1] = parameters[1]; + stride[2] = 0; + stride[3] = 0; + info[0] = 1; + info[1] = eReal; + info[2] = eP; + info[3] = 1; for (int i = tId; i < unitNumber; i+=numberThread) { - int xStart = i * CONVOLUTION_TILED_NUMBER; - auto aStart = aHost + xStart * 4; - MNNPackC4ForMatMul_A(tileHost, aStart, CONVOLUTION_TILED_NUMBER, parameters[1], eReal); - MNNPackedMatMul(cHost + 4 * xStart, tileHost, bHost, parameters.data(), cache, postParametersPtr, biasPtr); + int xStart = i * eP; + auto aStart = aHost + xStart * packUnit; + packedA((float*)(tileHost), (const float**)(&aStart), info, stride); + matmul((float*)(cHost + xStart * packUnit), (float*)tileHost, (float*)bHost, parameters.data(), postParametersPtr, biasPtr); } if (tId != numberThread -1) { return; } if (xCount > 0) { - int xStart = unitNumber * CONVOLUTION_TILED_NUMBER; - auto aStart = aHost + xStart * 4; + stride[0] = xCount; + stride[1] = parameters[1]; + info[2] = xCount; + + int xStart = unitNumber * eP; + auto aStart = aHost + xStart * packUnit; // Copy - MNNPackC4ForMatMul_A(tileHost, aStart, xCount, parameters[1], eReal); - 
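_generateMatMul above only recurses when a Strassen split is expected to pay off: splitting trades one of the eight block multiplies for extra passes over A (4), B (4) and C (7), weighted by the backend-provided penalty, and is skipped outright when the shapes do not divide cleanly. A compact restatement of that admission test:

```cpp
// Returns true when one more level of Strassen recursion is worthwhile,
// mirroring the checks and the saveCost formula in the hunk above.
static bool shouldSplitStrassen(float saveMatMulCost,
                                float aComputeCost, float bComputeCost, float cComputeCost,
                                float penalty,
                                int eSub, int hSub, int lReal, int lMinDiv,
                                int currentDepth, int maxDepth) {
    if (currentDepth >= maxDepth || eSub == 0 || hSub == 0 || lReal % lMinDiv != 0) {
        return false;   // shape too small or not evenly divisible: use the trivial GEMM
    }
    const float saveCost =
        saveMatMulCost - (aComputeCost + bComputeCost + cComputeCost) * penalty;
    return saveCost > 0.0f;
}
```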
MNNPackedMatMulRemain(cHost + 4 * xStart, tileHost, bHost, xCount, parameters.data(), cache, postParametersPtr, biasPtr); + packedA((float*)(tileHost), (const float**)(&aStart), info, stride); + matmulr((float*)(cHost + xStart * packUnit), (float*)tileHost, (float*)bHost, xCount, parameters.data(), postParametersPtr, biasPtr); } }, numberThread)); return NO_ERROR; } -#define MNNMATRIX_SUB_MULTITHREAD(c, a, b, widthC4, cStride, aStride, bStride, lSub) \ +#define MNNMATRIX_SUB_MULTITHREAD(c, a, b, widthC4, cStride, aStride, bStride, lSub, core) \ for (int y = tId; y < lSub; y+=numberThread) {\ -MNNMatrixSub(c + y * cStride, a + y * aStride, b + y * bStride, widthC4, 0, 0, 0, 1);\ +core->MNNMatrixSub((float*)(c + y * cStride * core->bytes), (float*)(a + y * aStride * core->bytes), (float*)(b + y * bStride * core->bytes), widthC4, 0, 0, 0, 1);\ }\ -#define MNNMATRIX_ADD_MULTITHREAD(c, a, b, widthC4, cStride, aStride, bStride, lSub) \ +#define MNNMATRIX_ADD_MULTITHREAD(c, a, b, widthC4, cStride, aStride, bStride, lSub, core) \ for (int y = tId; y < lSub; y+=numberThread) {\ -MNNMatrixAdd(c + y * cStride, a + y * aStride, b + y * bStride, widthC4, 0, 0, 0, 1);\ +core->MNNMatrixAdd((float*)(c + y * cStride * core->bytes), (float*)(a + y * aStride * core->bytes), (float*)(b + y * bStride * core->bytes), widthC4, 0, 0, 0, 1);\ }\ @@ -177,33 +155,36 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(const Tensor* AT, const Tensor auto e = AT->length(1); auto h = CT->length(0); auto lReal = BT->length(1); - static const int aUnit = 4; + auto core = static_cast(backend())->functions(); + auto aUnit = core->pack; auto numberThread = mSupportMultiThread ? ((CPUBackend*)backend())->threadNumber() : 1; int eP, lP, hP; - MNNGetMatMulPackMode(&eP, &lP, &hP); - auto hDiv = MNNGetC4DivNumber(hP); + core->MNNGetMatMulPackMode(&eP, &lP, &hP); + MNN_ASSERT(hP % core->pack == 0); + auto hDiv = hP / core->pack; auto eSub = (e / eP) / 2 * eP; auto lSub = l / 2; auto hSub = (h / hDiv) / 2 * hDiv; auto remainH = h - hSub * 2; auto remainE = e - eSub * 2; - if (currentDepth >= mMaxDepth || eSub == 0 || hSub == 0 || lReal % 8 != 0) { + auto lMinDiv = std::max(core->pack * 2, 2 * lP); + if (currentDepth >= mMaxDepth || eSub == 0 || hSub == 0 || lReal % lMinDiv != 0) { return _generateTrivalMatMul(AT, BT, CT, COT, postParameters); } /* Compute the memory read / write cost for expand */ - auto bLSub = lSub * 4; - auto bHSub = (hSub * 4) / hP; + auto bLSub = lSub * core->pack; + auto bHSub = (hSub * core->pack) / hP; float AComputeCost = 4 * ((float)eSub * lSub) * aUnit; float BComputeCost = 4 * (float)bLSub * bHSub * hP; float CComputeCost = 7 * (float)eSub * hSub * aUnit; float saveMatMulCost = (e / eP) * (aUnit * eP * hSub + lSub * eP * aUnit + bLSub * bHSub * hP); - const float pernaty = 1.5f;//FIXME: Find beter way to set it + const float penalty = core->penalty;//FIXME: Find beter way to set it //MNN_PRINT("%f - %f, %f, %f\n", saveMatMulCost, AComputeCost, BComputeCost, CComputeCost); - float saveCost = saveMatMulCost - (AComputeCost + BComputeCost + CComputeCost) * pernaty; + float saveCost = saveMatMulCost - (AComputeCost + BComputeCost + CComputeCost) * penalty; if (saveCost <= 0.0f) { return _generateTrivalMatMul(AT, BT, CT, COT, postParameters); } @@ -231,26 +212,26 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(const Tensor* AT, const Tensor PTensor X(Tensor::create(AS, XReal->host())); PTensor CX(Tensor::create(CS, XReal->host())); - auto xAddr = X->host(); - auto yAddr = Y->host(); + auto xAddr = 
X->host(); + auto yAddr = Y->host(); auto aStride = AT->stride(0); - auto a11 = AT->host() + 0 * aUnit * eSub + 0 * aStride * lSub; - auto a12 = AT->host() + 0 * aUnit * eSub + 1 * aStride * lSub; - auto a21 = AT->host() + 1 * aUnit * eSub + 0 * aStride * lSub; - auto a22 = AT->host() + 1 * aUnit * eSub + 1 * aStride * lSub; + auto a11 = AT->host() + (0 * aUnit * eSub + 0 * aStride * lSub) * core->bytes; + auto a12 = AT->host() + (0 * aUnit * eSub + 1 * aStride * lSub) * core->bytes; + auto a21 = AT->host() + (1 * aUnit * eSub + 0 * aStride * lSub) * core->bytes; + auto a22 = AT->host() + (1 * aUnit * eSub + 1 * aStride * lSub) * core->bytes; auto bStride = BT->stride(0); - auto b11 = BT->host() + 0 * bUnit * bLSub + 0 * bStride * bHSub; - auto b12 = BT->host() + 0 * bUnit * bLSub + 1 * bStride * bHSub; - auto b21 = BT->host() + 1 * bUnit * bLSub + 0 * bStride * bHSub; - auto b22 = BT->host() + 1 * bUnit * bLSub + 1 * bStride * bHSub; + auto b11 = BT->host() + (0 * bUnit * bLSub + 0 * bStride * bHSub) * core->bytes; + auto b12 = BT->host() + (0 * bUnit * bLSub + 1 * bStride * bHSub) * core->bytes; + auto b21 = BT->host() + (1 * bUnit * bLSub + 0 * bStride * bHSub) * core->bytes; + auto b22 = BT->host() + (1 * bUnit * bLSub + 1 * bStride * bHSub) * core->bytes; auto cStride = CT->stride(0); - auto c11 = CT->host() + 0 * aUnit * eSub + 0 * cStride * hSub; - auto c12 = CT->host() + 0 * aUnit * eSub + 1 * cStride * hSub; - auto c21 = CT->host() + 1 * aUnit * eSub + 0 * cStride * hSub; - auto c22 = CT->host() + 1 * aUnit * eSub + 1 * cStride * hSub; + auto c11 = CT->host() + (0 * aUnit * eSub + 0 * cStride * hSub) * core->bytes; + auto c12 = CT->host() + (0 * aUnit * eSub + 1 * cStride * hSub) * core->bytes; + auto c21 = CT->host() + (1 * aUnit * eSub + 0 * cStride * hSub) * core->bytes; + auto c22 = CT->host() + (1 * aUnit * eSub + 1 * cStride * hSub) * core->bytes; PTensor A11(Tensor::create(AS, a11)); A11->setStride(0, aStride); @@ -281,9 +262,9 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(const Tensor* AT, const Tensor { // S3=A11-A21, T3=B22-B12, P7=S3*T3 - auto f = [a11, a21, b22, b12, xAddr, yAddr, eSub, lSub, hSub, aStride, bStride, numberThread, bUnit, bLSub, bHSub](int tId) { - MNNMATRIX_SUB_MULTITHREAD(xAddr, a11, a21, eSub * aUnit / 4, eSub * aUnit, aStride, aStride, lSub); - MNNMATRIX_SUB_MULTITHREAD(yAddr, b22, b12, bLSub * bUnit / 4, bLSub * bUnit, bStride, bStride, bHSub); + auto f = [a11, a21, b22, b12, xAddr, yAddr, eSub, lSub, hSub, aStride, bStride, numberThread, bUnit, bLSub, bHSub, core](int tId) { + MNNMATRIX_SUB_MULTITHREAD(xAddr, a11, a21, eSub, eSub * core->pack, aStride, aStride, lSub, core); + MNNMATRIX_SUB_MULTITHREAD(yAddr, b22, b12, bLSub * bUnit / core->pack, bLSub * bUnit, bStride, bStride, bHSub, core); }; mFunctions.emplace_back(std::make_pair(f, numberThread)); auto code = _generateMatMul(X.get(), Y.get(), C21.get(), nullptr, currentDepth, {}); @@ -293,9 +274,9 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(const Tensor* AT, const Tensor } { // S1=A21+A22, T1=B12-B11, P5=S1T1 - auto f = [a22, a21, b11, b12, xAddr, yAddr, eSub, lSub, hSub, aStride, bStride, numberThread, bUnit, bLSub, bHSub](int tId) { - MNNMATRIX_ADD_MULTITHREAD(xAddr, a21, a22, eSub * aUnit / 4, eSub * aUnit, aStride, aStride, lSub); - MNNMATRIX_SUB_MULTITHREAD(yAddr, b12, b11, bLSub * bUnit / 4, bLSub * bUnit, bStride, bStride, bHSub); + auto f = [a22, a21, b11, b12, xAddr, yAddr, eSub, lSub, hSub, aStride, bStride, numberThread, bUnit, bLSub, bHSub, core](int tId) { + 
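With the computor now type-agnostic, block base addresses are formed on uint8_t* and every element offset is multiplied by core->bytes (e.g. 4 for float, 2 for a 16-bit type). A small illustration of that addressing rule for the four quadrants of A; blockAddr and splitA are illustrative helpers, not MNN functions.

#include <cstdint>
#include <cstddef>

// Illustrative only: quadrant base addresses in the packed layout, where
// offsets are counted in elements and then scaled by the element size.
static inline uint8_t* blockAddr(uint8_t* base, size_t elementOffset, size_t bytesPerElement) {
    return base + elementOffset * bytesPerElement;
}

struct Quadrants { uint8_t *a11, *a12, *a21, *a22; };

Quadrants splitA(uint8_t* A, size_t aUnit, size_t eSub, size_t aStride, size_t lSub, size_t bytes) {
    return {
        blockAddr(A, 0 * aUnit * eSub + 0 * aStride * lSub, bytes),
        blockAddr(A, 0 * aUnit * eSub + 1 * aStride * lSub, bytes),
        blockAddr(A, 1 * aUnit * eSub + 0 * aStride * lSub, bytes),
        blockAddr(A, 1 * aUnit * eSub + 1 * aStride * lSub, bytes),
    };
}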
MNNMATRIX_ADD_MULTITHREAD(xAddr, a21, a22, eSub, eSub * core->pack, aStride, aStride, lSub, core); + MNNMATRIX_SUB_MULTITHREAD(yAddr, b12, b11, bLSub * bUnit / core->pack, bLSub * bUnit, bStride, bStride, bHSub, core); }; mFunctions.emplace_back(std::make_pair(f, numberThread)); auto code = _generateMatMul(X.get(), Y.get(), C22.get(), nullptr, currentDepth, {}); @@ -305,9 +286,9 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(const Tensor* AT, const Tensor } { // S2=S1-A11, T2=B22-T1, P6=S2T2 - auto f = [a11, b22, xAddr, yAddr, eSub, lSub, hSub, aStride, bStride, numberThread, bUnit, bLSub, bHSub](int tId) { - MNNMATRIX_SUB_MULTITHREAD(xAddr, xAddr, a11, eSub * aUnit / 4, eSub * aUnit, eSub * aUnit, aStride, lSub); - MNNMATRIX_SUB_MULTITHREAD(yAddr, b22, yAddr, bLSub * bUnit / 4, bLSub * bUnit, bStride, bLSub * bUnit, bHSub); + auto f = [a11, b22, xAddr, yAddr, eSub, lSub, hSub, aStride, bStride, numberThread, bUnit, bLSub, bHSub, core](int tId) { + MNNMATRIX_SUB_MULTITHREAD(xAddr, xAddr, a11, eSub, eSub * core->pack, eSub * core->pack, aStride, lSub, core); + MNNMATRIX_SUB_MULTITHREAD(yAddr, b22, yAddr, bLSub * bUnit / core->pack, bLSub * bUnit, bStride, bLSub * bUnit, bHSub, core); }; mFunctions.emplace_back(std::make_pair(f, numberThread)); auto code = _generateMatMul(X.get(), Y.get(), C12.get(), nullptr, currentDepth, {}); @@ -317,8 +298,8 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(const Tensor* AT, const Tensor } { // S4=A12-S2, P3=S4*B22, P1=A11*B11 - auto f = [a12, xAddr, eSub, lSub, aStride, numberThread](int tId) { - MNNMATRIX_SUB_MULTITHREAD(xAddr, a12, xAddr, eSub * aUnit / 4, eSub * aUnit, aStride, eSub * aUnit, lSub); + auto f = [a12, xAddr, eSub, lSub, aStride, numberThread, core](int tId) { + MNNMATRIX_SUB_MULTITHREAD(xAddr, a12, xAddr, eSub, eSub * core->pack, aStride, eSub * core->pack, lSub, core); }; mFunctions.emplace_back(std::make_pair(f, numberThread)); auto code = _generateMatMul(X.get(), B22.get(), C11.get(), nullptr, currentDepth, {}); @@ -333,11 +314,11 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(const Tensor* AT, const Tensor { // U2=P1+P6, U3=U2+P7, U4=U2+P5, U7=U3+P5 // U5=U4+P3, T4=T2-B21, P4=A22*T4 - auto f = [c11, c12, c21, c22, b21, xAddr, yAddr, eSub, lSub, hSub, bStride, cStride, numberThread, bUnit, bHSub, bLSub](int tId) { + auto f = [c11, c12, c21, c22, b21, xAddr, yAddr, eSub, lSub, hSub, bStride, cStride, numberThread, bUnit, bHSub, bLSub, core](int tId) { for (int y = tId; y < hSub; y+=numberThread) { - MNNStrassenMergeCFunction(c11 + y * cStride, c12 + y * cStride, c21 + y * cStride, c22 + y * cStride, xAddr + y * eSub * 4, 0, eSub, 1); + core->MNNStrassenMergeCFunction((float*)(c11 + y * cStride * core->bytes), (float*)(c12 + y * cStride * core->bytes), (float*)(c21 + y * cStride * core->bytes), (float*)(c22 + y * cStride * core->bytes), (float*)(xAddr + y * eSub * core->pack * core->bytes), 0, eSub, 1); } - MNNMATRIX_SUB_MULTITHREAD(yAddr, yAddr, b21, bLSub * bUnit / 4, bLSub * bUnit, bLSub * bUnit, bStride, bHSub); + MNNMATRIX_SUB_MULTITHREAD(yAddr, yAddr, b21, bLSub * bUnit / core->pack, bLSub * bUnit, bLSub * bUnit, bStride, bHSub, core); }; mFunctions.emplace_back(std::make_pair(f, numberThread)); auto code = _generateMatMul(A22.get(), Y.get(), C11.get(), nullptr, currentDepth, {}); @@ -347,35 +328,35 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(const Tensor* AT, const Tensor } { // U6=U3-P4, P2=A12*B21, U1=P1+P2 - auto f0 = [c11, c21, eSub, hSub, cStride, numberThread](int tId) { - auto cw = eSub * aUnit / 4; 
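The steps above follow the Winograd form of Strassen's algorithm: seven sub-multiplications (P1..P7) plus fifteen additions/subtractions reconstruct the four output blocks. A scalar reference of that schedule for 1x1 blocks, which can be checked directly against the naive 2x2 product; sketch only.

// Scalar reference of the Winograd-form Strassen schedule the computor follows.
void strassenWinograd2x2(const float A[2][2], const float B[2][2], float C[2][2]) {
    float S3 = A[0][0] - A[1][0];   // S3 = A11 - A21
    float T3 = B[1][1] - B[0][1];   // T3 = B22 - B12
    float P7 = S3 * T3;
    float S1 = A[1][0] + A[1][1];   // S1 = A21 + A22
    float T1 = B[0][1] - B[0][0];   // T1 = B12 - B11
    float P5 = S1 * T1;
    float S2 = S1 - A[0][0];        // S2 = S1 - A11
    float T2 = B[1][1] - T1;        // T2 = B22 - T1
    float P6 = S2 * T2;
    float S4 = A[0][1] - S2;        // S4 = A12 - S2
    float P3 = S4 * B[1][1];
    float P1 = A[0][0] * B[0][0];
    float U2 = P1 + P6;
    float U3 = U2 + P7;
    float U4 = U2 + P5;
    float U7 = U3 + P5;             // C22
    float U5 = U4 + P3;             // C12
    float T4 = T2 - B[1][0];        // T4 = T2 - B21
    float P4 = A[1][1] * T4;
    float U6 = U3 - P4;             // C21
    float P2 = A[0][1] * B[1][0];
    float U1 = P1 + P2;             // C11
    C[0][0] = U1; C[0][1] = U5; C[1][0] = U6; C[1][1] = U7;
}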
- MNNMATRIX_SUB_MULTITHREAD(c21, c21, c11, cw, cStride, cStride, cStride, hSub); + auto f0 = [c11, c21, eSub, hSub, cStride, numberThread, core](int tId) { + auto cw = eSub; + MNNMATRIX_SUB_MULTITHREAD(c21, c21, c11, cw, cStride, cStride, cStride, hSub, core); }; mFunctions.emplace_back(std::make_pair(f0, numberThread)); auto code = _generateMatMul(A12.get(), B21.get(), C11.get(), nullptr, currentDepth, {}); if (code != NO_ERROR) { return code; } - auto f1 = [c11, xAddr, eSub, hSub, cStride, numberThread](int tId) { - auto cw = eSub * aUnit / 4; - MNNMATRIX_ADD_MULTITHREAD(c11, c11, xAddr, cw, cStride, cStride, eSub * aUnit, hSub); + auto f1 = [c11, xAddr, eSub, hSub, cStride, numberThread, core](int tId) { + auto cw = eSub; + MNNMATRIX_ADD_MULTITHREAD(c11, c11, xAddr, cw, cStride, cStride, eSub * core->pack, hSub, core); }; mFunctions.emplace_back(std::make_pair(f1, numberThread)); if (!postParameters.empty() && nullptr != COT) { auto biasPtr = COT->host(); if (1 == numberThread) { - auto postFunction = [c11, eSub, hSub, cStride, numberThread, biasPtr, postParameters](int tId) { + auto postFunction = [c11, eSub, hSub, cStride, numberThread, biasPtr, postParameters, core](int tId) { auto width = eSub * 2; auto height = hSub * 2; - MNNAxByClampBroadcastC4(c11, c11, biasPtr, width, cStride, cStride, height, postParameters.data()); + core->MNNAxByClampBroadcastUnit((float*)c11, (float*)c11, biasPtr, width, cStride, cStride, height, postParameters.data()); }; mFunctions.emplace_back(std::make_pair(postFunction, numberThread)); } else { - auto postFunction = [c11, eSub, hSub, cStride, numberThread, biasPtr, postParameters](int tId) { + auto postFunction = [c11, eSub, hSub, cStride, numberThread, biasPtr, postParameters, core](int tId) { auto width = eSub * 2; auto height = hSub * 2; for (int y = tId; y < height; y+=numberThread) { - MNNAxByClampBroadcastC4(c11 + y * cStride, c11 + y * cStride, biasPtr + y * 4, width, 0, 0, 1, postParameters.data()); + core->MNNAxByClampBroadcastUnit((float*)(c11 + y * cStride * core->bytes), (float*)(c11 + y * cStride * core->bytes), (const float*)((uint8_t*)biasPtr + y * core->bytes * core->pack), width, 0, 0, 1, postParameters.data()); } }; mFunctions.emplace_back(std::make_pair(postFunction, numberThread)); @@ -384,16 +365,16 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(const Tensor* AT, const Tensor } if (remainH > 0) { auto lastH = hSub * 2; - auto cLast = CT->host() + cStride * lastH; + auto cLast = CT->host() + cStride * lastH * core->bytes; auto lastHB = bHSub * 2; - auto bLast = BT->host() + bStride * lastHB; + auto bLast = BT->host() + bStride * lastHB * core->bytes; PTensor BLast(Tensor::create(std::vector{BT->length(0) - lastHB, BT->length(1), bUnit}, bLast)); PTensor CLast(Tensor::create(std::vector{remainH, eSub * 2, aUnit}, cLast)); PTensor ALast(Tensor::create(std::vector{l, eSub * 2, aUnit}, AT->host())); PTensor biasWrap; const Tensor* bias = COT; if (nullptr != bias) { - biasWrap.reset(Tensor::create(std::vector{remainH, 1, aUnit}, COT->host() + 4 * lastH)); + biasWrap.reset(Tensor::create(std::vector{remainH, 1, aUnit}, COT->host() + core->bytes * core->pack * lastH)); bias = biasWrap.get(); } BLast->setStride(0, bStride); @@ -405,8 +386,8 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(const Tensor* AT, const Tensor } } if (remainE > 0) { - auto aLast = AT->host() + eSub * 2 * aUnit; - auto cLast = CT->host() + eSub * 2 * aUnit; + auto aLast = AT->host() + eSub * 2 * aUnit * core->bytes; + auto cLast = CT->host() + eSub * 2 * 
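The post step now goes through core->MNNAxByClampBroadcastUnit, whose contract (as the AVX kernel further down implements it) is C = A + beta * broadcast(B), clamped to [minV, maxV] per pack-sized group. A scalar reference with pack = 4, assuming the parameters layout {alpha, beta, minV, maxV} read by that kernel.

#include <algorithm>
#include <cstddef>

// Scalar reference for the MNNAxByClampBroadcastUnit contract as used here.
void axByClampBroadcastC4(float* C, const float* A, const float* B,
                          size_t width, size_t cStride, size_t aStride,
                          size_t height, const float* parameters) {
    const float beta = parameters[1];
    const float minV = parameters[2];
    const float maxV = parameters[3];
    for (size_t y = 0; y < height; ++y) {
        const float* a = A + aStride * y;
        const float* b = B + 4 * y;          // one pack-sized bias vector per row
        float* c = C + cStride * y;
        for (size_t x = 0; x < width; ++x) {
            for (int k = 0; k < 4; ++k) {
                float v = a[4 * x + k] + beta * b[k];
                c[4 * x + k] = std::min(std::max(v, minV), maxV);
            }
        }
    }
}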
aUnit * core->bytes; PTensor ALast(Tensor::create(std::vector{l, remainE, aUnit}, aLast)); PTensor CLast(Tensor::create(std::vector{h, remainE, aUnit}, cLast)); ALast->setStride(0, aStride); diff --git a/source/backend/cpu/x86_x64/CMakeLists.txt b/source/backend/cpu/x86_x64/CMakeLists.txt index 2759aeb8..1b51143b 100644 --- a/source/backend/cpu/x86_x64/CMakeLists.txt +++ b/source/backend/cpu/x86_x64/CMakeLists.txt @@ -7,8 +7,10 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64) FILE(GLOB MNN_X8664_SRC ${CMAKE_CURRENT_LIST_DIR}/*) if (MSVC) FILE(GLOB MNN_AVX_SRC ${CMAKE_CURRENT_LIST_DIR}/avx/*.cpp) + FILE(GLOB MNN_AVXFMA_SRC ${CMAKE_CURRENT_LIST_DIR}/avxfma/*.cpp) else() FILE(GLOB MNN_AVX_SRC ${CMAKE_CURRENT_LIST_DIR}/avx/*) + FILE(GLOB MNN_AVXFMA_SRC ${CMAKE_CURRENT_LIST_DIR}/avxfma/*) if (MNN_AVX512) FILE(GLOB MNN_AVX512_SRC ${CMAKE_CURRENT_LIST_DIR}/avx512/*) add_library(MNNAVX512 OBJECT ${MNN_AVX512_SRC}) @@ -18,14 +20,23 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64) FILE(GLOB MNN_SSE_SRC ${CMAKE_CURRENT_LIST_DIR}/sse/*) add_library(MNNX8664 OBJECT ${MNN_X8664_SRC}) add_library(MNNAVX OBJECT ${MNN_AVX_SRC}) + add_library(MNNAVXFMA OBJECT ${MNN_AVXFMA_SRC}) add_library(MNNSSE OBJECT ${MNN_SSE_SRC}) if(MSVC) target_compile_options(MNNAVX PRIVATE /arch:AVX) + target_compile_options(MNNAVXFMA PRIVATE /arch:AVX) else() target_compile_options(MNNSSE PRIVATE -msse4.1) - target_compile_options(MNNAVX PRIVATE -mavx2 -mfma -DMNN_X86_USE_ASM) + target_compile_options(MNNAVX PRIVATE -mavx2 -DMNN_X86_USE_ASM) + target_compile_options(MNNAVXFMA PRIVATE -mavx2 -mfma -DMNN_X86_USE_ASM) endif() - list(APPEND MNN_OBJECTS_TO_LINK $ $ $) + if (MNN_SUPPORT_BF16) + target_compile_options(MNNAVXFMA PRIVATE -DMNN_SUPPORT_BF16) + if (MNN_SSE_USE_FP16_INSTEAD) + target_compile_options(MNNAVXFMA PRIVATE -DMNN_SSE_USE_FP16_INSTEAD -mf16c) + endif() + endif() + list(APPEND MNN_OBJECTS_TO_LINK $ $ $ $) if (MNN_AVX512) target_compile_options(MNNX8664 PRIVATE -DMNN_AVX512) list(APPEND MNN_OBJECTS_TO_LINK $) diff --git a/source/backend/cpu/x86_x64/FunctionDispatcher.cpp b/source/backend/cpu/x86_x64/FunctionDispatcher.cpp index 4e9b1c2f..083aab9f 100644 --- a/source/backend/cpu/x86_x64/FunctionDispatcher.cpp +++ b/source/backend/cpu/x86_x64/FunctionDispatcher.cpp @@ -9,6 +9,7 @@ #include #include "avx512/FunctionSummary.hpp" #include "avx/FunctionSummary.hpp" +#include "avxfma/FunctionSummary.hpp" #include "backend/cpu/compute/CommonOptFunction.h" #include "backend/cpu/compute/ConvOpt.h" #include "backend/cpu/compute/Int8FunctionsOpt.h" @@ -30,33 +31,6 @@ struct FunctionGroup { int eP = 12; int lP = 1; int hP = 4; - void (*MNNAddBias)(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) = _SSE_MNNAddBias; - void (*MNNAddBiasRelu)(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) = _SSE_MNNAddBiasRelu; - void (*MNNAddBiasRelu6)(float* dst, const float* bias, size_t planeNumber, - size_t biasNumber) = _SSE_MNNAddBiasRelu6; - - void (*MNNMatrixAdd)(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride, - size_t bStride, size_t height) = _SSE_MNNMatrixAdd; - void (*MNNMatrixSub)(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride, - size_t bStride, size_t height) = _SSE_MNNMatrixSub; - - void (*MNNGemmFloatUnit_4)(float* dstOrigin, const float* src, const float* weight, size_t src_depth_quad, - size_t dst_step, size_t dst_depth_quad, - size_t 
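Splitting the FMA translation units into their own MNNAVXFMA object library appears intended to keep -mfma-generated instructions out of the plain AVX objects, so a CPU without FMA3 never executes them; the runtime check in MNNFunctionInit then decides which set is installed. Illustrative example of code that belongs only in the FMA-compiled objects (not an actual MNN symbol).

#include <immintrin.h>

// Compiled only in the objects built with -mfma; other code reaches it through
// the dispatched function table, never directly.
__m256 fmaMulAdd(__m256 a, __m256 b, __m256 c) {
    return _mm256_fmadd_ps(a, b, c);   // requires FMA3 at runtime
}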
weight_depth_offset) = _SSE_MNNGemmFloatUnit_4; - void (*MNNGemmFloatCommon_4)(float* dst, const float* src, const float* weight, size_t src_depth_quad, - size_t dst_step, size_t dst_depth_quad, size_t width, - size_t weight_depth_offset) = _SSE_MNNGemmFloatCommon_4; - void (*MNNPackC4ForMatMul_A)(float* dest, const float* source, size_t e, size_t l, - size_t eReal) = _SSE_MNNPackC4ForMatMul_A; - void (*MNNPackForMatMul_B)(float* dest, const float* source, size_t h, size_t l, bool transpose) = _SSE_MNNPackForMatMul_B; - void (*MNNPackedMatMul)(float* C, const float* A, const float* B, const size_t* parameter, float* cache, - const float* postParameters, const float* bias) = _SSE_MNNPackedMatMul; - void (*MNNPackedMatMulRemain)(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, - float* cache, const float* postParameters, - const float* bias) = _SSE_MNNPackedMatMulRemain; - void (*MNNConvRunForLineDepthwise)(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, - size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, - size_t srcHStep, size_t dstHStep) = _SSE_MNNConvRunForLineDepthwise; void (*MNNGemmInt8AddBiasScale_16x4_Unit)(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) = _SSE_MNNGemmInt8AddBiasScale_16x4_Unit; void (*MNNGemmInt8AddBiasScale_16x4_Unit_FAST)(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) = _SSE_MNNGemmInt8AddBiasScale_16x4_Unit; void (*MNNExpC8)(float* dest, const float* source, const float* parameters, size_t countC8) = _SSE_MNNExpC8; @@ -65,24 +39,45 @@ struct FunctionGroup { void (*MNNInt8ScaleToFloat)(float* dst, const int8_t* src, const float* scale, size_t size, ssize_t zeroPoint) = _SSE_MNNInt8ScaleToFloat; void (*MNNLineDepthWiseInt8AddBiasScaleUnit)(int8_t* dst, const int8_t* src, const int8_t* weight, const QuanPostTreatParameters* parameters, size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step) = _SSE_MNNLineDepthWiseInt8AddBiasScaleUnit; void (*MNNComputeMatMulForE_1)(const float* A, const float* B, float* C, const float* biasPtr, const MatMulParam* param, size_t tId) = _SSE_MNNComputeMatMulForE_1; + void (*MNNReluWithSlopeChannel)(float* dst, const float* src, const float* slope, size_t sizeQuad, size_t depthQuad) = _SSE_MNNReluWithSlopeChannel; + void (*MNNReluInt8)(int8_t* dst, const int8_t* src, size_t size) = _SSE_MNNReluInt8; + void (*MNNHardSwish)(float* dst, const float* src, size_t size) = _SSE_MNNHardSwish; }; static FunctionGroup gFunc; + +void _SSEMNNGetMatMulPackMode(int* eP, int *lP, int* hP) { + *eP = gFunc.eP; + *lP = gFunc.lP; + *hP = gFunc.hP; +} void MNNFunctionInit() { auto cpuFlags = libyuv::InitCpuFlags(); + auto coreFunction = MNN::MNNGetCoreFunctions(); + if (cpuFlags & libyuv::kCpuHasSSSE3) { + coreFunction->MNNGetMatMulPackMode = _SSEMNNGetMatMulPackMode; + coreFunction->MNNMatrixAdd = _SSE_MNNMatrixAdd; + coreFunction->MNNMatrixSub = _SSE_MNNMatrixSub; + coreFunction->MNNPackedMatMul = _SSE_MNNPackedMatMul; + coreFunction->MNNPackedMatMulRemain = _SSE_MNNPackedMatMulRemain; + coreFunction->MNNPackC4ForMatMul_A = _SSE_MNNPackC4ForMatMul_A; + coreFunction->MNNPackForMatMul_B = _SSE_MNNPackForMatMul_B; + coreFunction->MNNConvRunForLineDepthwise = 
_SSE_MNNConvRunForLineDepthwise; + coreFunction->MNNAxByClampBroadcastUnit = _SSE_MNNAxByClampBroadcastUnit; + } if (cpuFlags & libyuv::kCpuHasAVX2) { - gFunc.MNNAddBias = _AVX_MNNAddBias; - gFunc.MNNAddBiasRelu = _AVX_MNNAddBiasRelu; - gFunc.MNNAddBiasRelu6 = _AVX_MNNAddBiasRelu6; - gFunc.MNNMatrixAdd = _AVX_MNNMatrixAdd; - gFunc.MNNMatrixSub = _AVX_MNNMatrixSub; - gFunc.MNNGemmFloatUnit_4 = _AVX_MNNGemmFloatUnit_4; - gFunc.MNNGemmFloatCommon_4 = _AVX_MNNGemmFloatCommon_4; - gFunc.MNNPackedMatMul = _AVX_MNNPackedMatMul; - gFunc.MNNPackedMatMulRemain = _AVX_MNNPackedMatMulRemain; gFunc.eP = 24; - gFunc.MNNPackC4ForMatMul_A = _AVX_MNNPackC4ForMatMul_A; - gFunc.MNNConvRunForLineDepthwise = _AVX_MNNConvRunForLineDepthwise; + gFunc.lP = 1; + gFunc.hP = 4; + + coreFunction->MNNMatrixAdd = _AVX_MNNMatrixAdd; + coreFunction->MNNMatrixSub = _AVX_MNNMatrixSub; + coreFunction->MNNPackedMatMul = _AVX_MNNPackedMatMul; + coreFunction->MNNPackedMatMulRemain = _AVX_MNNPackedMatMulRemain; + coreFunction->MNNPackC4ForMatMul_A = _AVX_MNNPackC4ForMatMul_A; + coreFunction->MNNConvRunForLineDepthwise = _AVX_MNNConvRunForLineDepthwise; + coreFunction->MNNAxByClampBroadcastUnit = _AVX_MNNAxByClampBroadcastUnit; + gFunc.MNNGemmInt8AddBiasScale_16x4_Unit = _AVX_MNNGemmInt8AddBiasScale_16x4_Unit; gFunc.MNNExpC8 = _AVX_MNNExpC8; gFunc.MNNFloat2Int8 = _AVX_MNNFloat2Int8; @@ -90,22 +85,22 @@ void MNNFunctionInit() { gFunc.MNNLineDepthWiseInt8AddBiasScaleUnit = _AVX_MNNLineDepthWiseInt8AddBiasScaleUnit; gFunc.MNNComputeMatMulForE_1 = _AVX_MNNComputeMatMulForE_1; gFunc.MNNGemmInt8AddBiasScale_16x4_Unit_FAST = _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_Fast; + gFunc.MNNReluWithSlopeChannel = _AVX_MNNReluWithSlopeChannel; if (cpuFlags & libyuv::kCpuHasFMA3) { - gFunc.MNNGemmFloatUnit_4 = _AVX_MNNGemmFloatUnitFMA_4; - gFunc.MNNGemmFloatCommon_4 = _AVX_MNNGemmFloatCommonFMA_4; - gFunc.MNNPackedMatMul = _AVX_MNNPackedMatMulFMA; - gFunc.MNNPackedMatMulRemain = _AVX_MNNPackedMatMulRemainFMA; + coreFunction->MNNPackedMatMul = _AVX_MNNPackedMatMulFMA; + coreFunction->MNNPackedMatMulRemain = _AVX_MNNPackedMatMulRemainFMA; gFunc.MNNComputeMatMulForE_1 = _AVX_MNNComputeMatMulForE_1FMA; } } #ifdef MNN_AVX512 if (cpuFlags & libyuv::kCpuHasAVX512VNNI) { -// gFunc.MNNPackForMatMul_B = _AVX512_MNNPackForMatMul_B; -// gFunc.MNNPackC4ForMatMul_A = _AVX512_MNNPackC4ForMatMul_A; -// gFunc.MNNPackedMatMul = _AVX512_MNNPackedMatMul; -// gFunc.MNNPackedMatMulRemain = _AVX512_MNNPackedMatMulRemain; -// gFunc.eP = 48; -// gFunc.hP = 8; + coreFunction->MNNPackForMatMul_B = _AVX512_MNNPackForMatMul_B; + coreFunction->MNNPackC4ForMatMul_A = _AVX512_MNNPackC4ForMatMul_A; + coreFunction->MNNPackedMatMul = _AVX512_MNNPackedMatMul; + coreFunction->MNNPackedMatMulRemain = _AVX512_MNNPackedMatMulRemain; + gFunc.eP = 24; + gFunc.hP = 4; + gFunc.lP = 4; gFunc.MNNGemmInt8AddBiasScale_16x4_Unit = _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit; gFunc.MNNGemmInt8AddBiasScale_16x4_Unit_FAST = _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit; } @@ -113,17 +108,6 @@ void MNNFunctionInit() { } // ========= CommonOptFunction.cpp =========== -void MNNAddBias(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) { - return gFunc.MNNAddBias(dst, bias, planeNumber, biasNumber); -} - -void MNNAddBiasRelu(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) { - return gFunc.MNNAddBiasRelu(dst, bias, planeNumber, biasNumber); -} - -void MNNAddBiasRelu6(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) { - return 
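MNNFunctionInit now layers kernel selection: SSSE3 installs the baseline entries into the shared CoreFunctions table, AVX2 overrides the matmul and pack entries and raises eP to 24, FMA3 swaps in the FMA matmul variants, and AVX-512 (when built with MNN_AVX512) overrides again. A condensed sketch of that pattern; the table, flags and kernels below are illustrative stand-ins, not the MNN API.

#include <cstddef>

// Illustrative layered dispatch: each capability level replaces only the
// entries it improves. Layouts in the reference kernel are illustrative.
using MatMulFn = void (*)(float* C, const float* A, const float* B, int e, int l, int h);

static void matmulReference(float* C, const float* A, const float* B, int e, int l, int h) {
    for (int y = 0; y < h; ++y)
        for (int x = 0; x < e; ++x) {
            float s = 0.f;
            for (int k = 0; k < l; ++k) s += A[k * e + x] * B[y * l + k];
            C[y * e + x] = s;
        }
}
static void matmulAvx2(float* C, const float* A, const float* B, int e, int l, int h) {
    matmulReference(C, A, B, e, l, h);   // placeholder body
}

struct KernelTable { MatMulFn packedMatMul = matmulReference; int eP = 12, lP = 1, hP = 4; };

void initKernels(KernelTable* t, bool hasAVX2, bool hasFMA3) {
    if (hasAVX2) { t->packedMatMul = matmulAvx2; t->eP = 24; }
    if (hasAVX2 && hasFMA3) { /* the FMA variants would be installed here */ }
}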
gFunc.MNNAddBiasRelu6(dst, bias, planeNumber, biasNumber); -} void MNNCopyC4WithStride(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count) { _SSE_MNNCopyC4WithStride(source, dest, srcStride, dstStride, count); @@ -133,50 +117,18 @@ void MNNAddC4WithStride(const float* source, float* dest, size_t srcStride, size _SSE_MNNAddC4WithStride(source, dest, srcStride, dstStride, count); } -void MNNGemmFloatUnit_4(float* dstOrigin, const float* src, const float* weight, size_t src_depth_quad, size_t dst_step, - size_t dst_depth_quad, size_t weight_depth_offset) { - gFunc.MNNGemmFloatUnit_4(dstOrigin, src, weight, src_depth_quad, dst_step, dst_depth_quad, weight_depth_offset); -} - -// ========= MNNGemmFloatCommon_4.cpp =========== -void MNNGemmFloatCommon_4(float* dst, const float* src, const float* weight, size_t src_depth_quad, size_t dst_step, - size_t dst_depth_quad, size_t width, size_t weight_depth_offset) { - gFunc.MNNGemmFloatCommon_4(dst, src, weight, src_depth_quad, dst_step, dst_depth_quad, width, weight_depth_offset); -} - -// ========= MNNMatrixAdd.cpp =========== -void MNNMatrixAdd(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride, - size_t bStride, size_t height) { - gFunc.MNNMatrixAdd(C, A, B, widthC4, cStride, aStride, bStride, height); -} - -// ========= MNNMatrixSub.cpp =========== -void MNNMatrixSub(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride, - size_t bStride, size_t height) { - gFunc.MNNMatrixSub(C, A, B, widthC4, cStride, aStride, bStride, height); -} - void MNNReluWithSlopeChannel(float* dst, const float* src, const float* slope, size_t sizeQuad, size_t depthQuad) { - return _SSE_MNNReluWithSlopeChannel(dst, src, slope, sizeQuad, depthQuad); + return gFunc.MNNReluWithSlopeChannel(dst, src, slope, sizeQuad, depthQuad); } -void MNNPackC4ForMatMul_A(float* dest, const float* source, size_t e, size_t l, size_t eReal) { - return gFunc.MNNPackC4ForMatMul_A(dest, source, e, l, eReal); +void MNNReluInt8(int8_t* dst, const int8_t* src, size_t size) { + return gFunc.MNNReluInt8(dst, src, size); } -void MNNPackForMatMul_B(float* dest, const float* source, size_t h, size_t l, bool transpose) { - gFunc.MNNPackForMatMul_B(dest, source, h, l, transpose); +void MNNHardSwish(float* dst, const float* src, size_t size) { + return gFunc.MNNHardSwish(dst, src, size); } -void MNNGetMatMulPackMode(int* eP, int* lP, int* hP) { - *eP = gFunc.eP; - *lP = gFunc.lP; - *hP = gFunc.hP; -} - -int MNNGetConvolutionTileNumber() { - return gFunc.tileNumber; -} void MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minValue, ssize_t maxValue, ssize_t zeroPoint) { return gFunc.MNNFloat2Int8(src, dst, sizeQuad, scalep, minValue, maxValue, zeroPoint); @@ -185,22 +137,9 @@ void MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, size return gFunc.MNNInt8ScaleToFloat(dst, src, scale, size, zeroPoint); } -void MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter, float* cache, - const float* postParameters, const float* bias) { - return gFunc.MNNPackedMatMul(C, A, B, parameter, cache, postParameters, bias); -} -void MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, - float* cache, const float* postParameters, const float* bias) { - return gFunc.MNNPackedMatMulRemain(C, A, B, eSize, parameter, cache, postParameters, bias); -} void MNNExpC8(float* dest, const 
float* source, const float* parameters, size_t countC8) { gFunc.MNNExpC8(dest, source, parameters, countC8); } -void MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, - size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, - size_t srcHStep, size_t dstHStep) { - return gFunc.MNNConvRunForLineDepthwise(dst, src, weight, width, src_w_setup, fw, fh, dilateX_step, dilateY_step, height, srcHStep, dstHStep); -} void MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) { return gFunc.MNNGemmInt8AddBiasScale_16x4_Unit(dst, src, weight, src_depth_quad, dst_step, dst_depth_quad, post, realDst); diff --git a/source/backend/cpu/x86_x64/avx/CommonOptFunction.cpp b/source/backend/cpu/x86_x64/avx/CommonOptFunction.cpp index 984f8ff2..4af6d4b4 100644 --- a/source/backend/cpu/x86_x64/avx/CommonOptFunction.cpp +++ b/source/backend/cpu/x86_x64/avx/CommonOptFunction.cpp @@ -13,78 +13,68 @@ #include #include "FunctionSummary.hpp" #include "core/Macro.h" -void _AVX_MNNAddBias(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) { - if (planeNumber == 0) { - return; - } - for (int z = 0; z < biasNumber; ++z) { - auto biasV = _mm256_broadcast_ps((const __m128*)(bias + 4 * z)); - float* dst_z = dst + planeNumber * 4 * z; - for (int p = 0; p < planeNumber - 1; p += 2) { - auto dstV = _mm256_add_ps(_mm256_loadu_ps(dst_z + 4 * p), biasV); - _mm256_storeu_ps(dst_z + 4 * p, dstV); +void _AVX_MNNReluWithSlopeChannel(float* dst, const float* src, const float* slope, size_t sizeQuad, size_t depthQuad) { + auto zero = _mm_set1_ps(0.0f); + auto zero2 = _mm256_set1_ps(0.0f); + int sizeC8 = sizeQuad / 2; + int sizeRemain = sizeQuad % 2; + for (int j = 0; j < depthQuad; j++) { + auto slopeZ = _mm_loadu_ps(slope + 4 * j); + auto slopeZ2 = _mm256_castsi256_ps(_mm256_broadcastsi128_si256(_mm_castps_si128(slopeZ))); + const float* srcZ = src + 4 * j * sizeQuad; + float* dstZ = dst + 4 * j * sizeQuad; + for (int i = 0; i < sizeC8; i++) { + auto src = _mm256_loadu_ps(srcZ); + auto mask0 = _mm256_cmp_ps(src, zero2, 0x01); + auto mask1 = _mm256_cmp_ps(src, zero2, 0x0D); + auto other = _mm256_mul_ps(src, slopeZ2); + _mm256_storeu_ps(dstZ, _mm256_add_ps(_mm256_and_ps(other, mask0), _mm256_and_ps(src, mask1))); + srcZ += 8; + dstZ += 8; } - if (planeNumber % 2 == 1) { - _mm256_zeroall(); - auto biasV = _mm_loadu_ps(bias + 4 * z); - auto dstV = _mm_add_ps(_mm_loadu_ps(dst_z + 4 * (planeNumber - 1)), biasV); - _mm_storeu_ps(dst_z + 4 * (planeNumber - 1), dstV); + for (int i = 0; i < sizeRemain; i++) { + auto src = _mm_loadu_ps(srcZ + 4 * i); + auto mask0 = _mm_cmplt_ps(src, zero); + auto mask1 = _mm_cmpge_ps(src, zero); + auto other = _mm_mul_ps(src, slopeZ); + _mm_storeu_ps(dstZ + 4 * i, _mm_add_ps(_mm_and_ps(other, mask0), _mm_and_ps(src, mask1))); } } - _mm256_zeroall(); } -void _AVX_MNNAddBiasRelu(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) { - if (planeNumber == 0) { - return; - } - auto maxV = _mm256_set1_ps(0.0f); - for (int z = 0; z < biasNumber; ++z) { - auto biasV = _mm256_broadcast_ps((const __m128*)(bias + 4 * z)); - float* dst_z = dst + planeNumber * 4 * z; - for (int p = 0; p < planeNumber - 1; p += 2) { - auto dstV = _mm256_add_ps(_mm256_loadu_ps(dst_z + 4 * p), biasV); - dstV = _mm256_max_ps(dstV, maxV); - _mm256_storeu_ps(dst_z + 4 * p, dstV); - } 
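_AVX_MNNReluWithSlopeChannel is channel-wise PReLU built from compare masks: negative lanes are scaled by the per-channel slope, non-negative lanes pass through unchanged. A scalar reference of the same contract (pack = 4), useful for checking the vector code against.

#include <cstddef>

// Scalar reference for MNNReluWithSlopeChannel (channel-wise PReLU).
void reluWithSlopeChannel(float* dst, const float* src, const float* slope,
                          size_t sizeQuad, size_t depthQuad) {
    for (size_t z = 0; z < depthQuad; ++z) {
        const float* srcZ = src + 4 * z * sizeQuad;
        float* dstZ       = dst + 4 * z * sizeQuad;
        for (size_t i = 0; i < sizeQuad; ++i) {
            for (int k = 0; k < 4; ++k) {
                float v = srcZ[4 * i + k];
                dstZ[4 * i + k] = v < 0.f ? v * slope[4 * z + k] : v;
            }
        }
    }
}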
- if (planeNumber % 2 == 1) { - _mm256_zeroall(); - auto biasV = _mm_loadu_ps(bias + 4 * z); - auto dstV = _mm_add_ps(_mm_loadu_ps(dst_z + 4 * (planeNumber - 1)), biasV); - dstV = _mm_max_ps(dstV, _mm_set1_ps(0.0f)); - _mm_storeu_ps(dst_z + 4 * (planeNumber - 1), dstV); - maxV = _mm256_set1_ps(0.0f); - } - } - _mm256_zeroall(); -} -void _AVX_MNNAddBiasRelu6(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) { - if (planeNumber == 0) { - return; - } - auto maxV = _mm256_set1_ps(0.0f); - auto minV = _mm256_set1_ps(6.0f); - for (int z = 0; z < biasNumber; ++z) { - auto biasV = _mm256_broadcast_ps((const __m128*)(bias + 4 * z)); - float* dst_z = dst + planeNumber * 4 * z; - for (int p = 0; p < planeNumber - 1; p += 2) { - auto dstV = _mm256_add_ps(_mm256_loadu_ps(dst_z + 4 * p), biasV); - dstV = _mm256_max_ps(dstV, maxV); - dstV = _mm256_min_ps(dstV, minV); - _mm256_storeu_ps(dst_z + 4 * p, dstV); +void _AVX_MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters) { + auto minF = _mm256_broadcast_ss(parameters + 2); + auto maxF = _mm256_broadcast_ss(parameters + 3); + auto beta = _mm256_broadcast_ss(parameters + 1); + auto minF1 = _mm_broadcast_ss(parameters + 2); + auto maxF1 = _mm_broadcast_ss(parameters + 3); + auto beta1 = _mm_broadcast_ss(parameters + 1); + int widthC2 = width / 2; + int widthRemain = width % 2; + for (int y = 0; y < height; ++y) { + auto a = A + aStride * y; + auto b = B + 4 * y; + auto bv = _mm_loadu_ps(b); + auto bv2 = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_broadcastsi128_si256(_mm_castps_si128(bv)), _mm_castps_si128(bv), 1)); + auto c = C + cStride * y; + for (int x = 0; x < widthC2; ++x) { + auto av = _mm256_loadu_ps(a); + auto cv = _mm256_add_ps(av, _mm256_mul_ps(bv2, beta)); + cv = _mm256_min_ps(cv, maxF); + cv = _mm256_max_ps(cv, minF); + _mm256_storeu_ps(c, cv); + a += 8; + c += 8; } - if (planeNumber % 2 == 1) { - _mm256_zeroall(); - auto biasV = _mm_loadu_ps(bias + 4 * z); - auto dstV = _mm_add_ps(_mm_loadu_ps(dst_z + 4 * (planeNumber - 1)), biasV); - dstV = _mm_min_ps(_mm_max_ps(dstV, _mm_set1_ps(0.0f)), _mm_set1_ps(6.0f)); - _mm_storeu_ps(dst_z + 4 * (planeNumber - 1), dstV); - maxV = _mm256_set1_ps(0.0f); - minV = _mm256_set1_ps(6.0f); + if (widthRemain > 0) { + auto av = _mm_loadu_ps(a); + auto cv = _mm_add_ps(av, _mm_mul_ps(bv, beta1)); + cv = _mm_min_ps(cv, maxF1); + cv = _mm_max_ps(cv, minF1); + _mm_storeu_ps(c, cv); } } - _mm256_zeroall(); } static void _postTreat(float* C, size_t eSize, const size_t* parameter, const float* postParameters, @@ -336,6 +326,7 @@ void _AVX_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const fl __m256 zero = _mm256_set1_ps(0); __m256 minValue = _mm256_set1_ps(minV); __m256 maxValue = _mm256_set1_ps(maxV); + __m256 zeroPointValue = _mm256_set1_ps(zeroPoint); __m256 plus = _mm256_set1_ps(0.5f); __m256 minus = _mm256_set1_ps(-0.5f); __m256 scaleValue2 = _mm256_insertf128_ps(_mm256_castps128_ps256(scaleValue), scaleValue, 1); @@ -343,6 +334,7 @@ void _AVX_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const fl for (int i = 0; i < sizeC2; ++i) { auto f0 = _mm256_loadu_ps(src); f0 = _mm256_mul_ps(f0, scaleValue2); + f0 = _mm256_add_ps(f0, zeroPointValue); f0 = _mm256_min_ps(f0, maxValue); f0 = _mm256_max_ps(f0, minValue); // 1: _CMP_LT_OS @@ -365,11 +357,13 @@ void _AVX_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const fl __m128i zero = _mm_set1_epi32(0); 
__m128 minValue = _mm_set1_ps(minV); __m128 maxValue = _mm_set1_ps(maxV); + __m128 zeroPointValue = _mm_set1_ps(zeroPoint); __m128 plus = _mm_set1_ps(0.5f); __m128 minus = _mm_set1_ps(-0.5f); alignas(16) int32_t temp[4]; __m128 f0 = _mm_loadu_ps(src); f0 = _mm_mul_ps(f0, scaleValue); + f0 = _mm_add_ps(f0, zeroPointValue); f0 = _mm_min_ps(f0, maxValue); f0 = _mm_max_ps(f0, minValue); auto m0 = _mm_cmplt_ps(f0, _mm_castsi128_ps(zero)); @@ -390,11 +384,16 @@ void _AVX_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, __m128i zero = _mm_set1_epi32(0); __m128 scaleValue = _mm_loadu_ps(scale); __m256 scaleValue2 = _mm256_insertf128_ps(_mm256_castps128_ps256(scaleValue), scaleValue, 1); + __m256i zeroPointValue = _mm256_set1_epi32(zeroPoint); for (int i = 0; i < sizeC4; ++i) { auto s0 = _mm_castps_si128(_mm_loadu_ps((const float*)src)); auto s1 = _mm_unpackhi_epi64(s0, zero); - auto Sf0 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(s0)); - auto Sf1 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(s1)); + auto st0 = _mm256_cvtepi8_epi32(s0); + auto st1 = _mm256_cvtepi8_epi32(s1); + st0 = _mm256_sub_epi32(st0, zeroPointValue); + st1 = _mm256_sub_epi32(st1, zeroPointValue); + auto Sf0 = _mm256_cvtepi32_ps(st0); + auto Sf1 = _mm256_cvtepi32_ps(st1); _mm256_storeu_ps(dst + 8 * 0, _mm256_mul_ps(Sf0, scaleValue2)); _mm256_storeu_ps(dst + 8 * 1, _mm256_mul_ps(Sf1, scaleValue2)); src += 16; @@ -405,8 +404,12 @@ void _AVX_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, ::memcpy(srcTemp, src, sizeRemain * 4); auto s0 = *(__m128i*)srcTemp; auto s1 = _mm_unpackhi_epi64(s0, zero); - auto Sf0 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(s0)); - auto Sf1 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(s1)); + auto st0 = _mm256_cvtepi8_epi32(s0); + auto st1 = _mm256_cvtepi8_epi32(s1); + st0 = _mm256_sub_epi32(st0, zeroPointValue); + st1 = _mm256_sub_epi32(st1, zeroPointValue); + auto Sf0 = _mm256_cvtepi32_ps(st0); + auto Sf1 = _mm256_cvtepi32_ps(st1); switch (sizeRemain) { case 3: _mm256_storeu_ps(dst + 8 * 0, _mm256_mul_ps(Sf0, scaleValue2)); @@ -761,58 +764,3 @@ void _AVX_MNNComputeMatMulForE_1(const float* A, const float* B, float* C, const } } } - -void _AVX_MNNComputeMatMulForE_1FMA(const float* A, const float* B, float* C, const float* biasPtr, const MatMulParam* param, size_t tId) { - auto l = param->l; - auto h = param->h; - auto numberThread = param->numberThread; - auto lC4 = l / 8; - auto lR = lC4 * 8; - if (param->BTranspose) { - for (int y=tId; y 0) { + auto destX = (int64_t*)(dest + lC8 * eDest * 8); + auto srcX0 = (int64_t*)(source + (2 * lC8 + 0) * eReal * 4); + + for (int y=0; y #include -#define TRANPOSE_SAVE(u, v, z0, z3, z6, z9) \ - { \ - auto m0 = _mm256_extractf128_ps(z0, u); \ - auto m1 = _mm256_extractf128_ps(z3, u); \ - auto m2 = _mm256_extractf128_ps(z6, u); \ - auto m3 = _mm256_extractf128_ps(z9, u); \ - _MM_TRANSPOSE4_PS(m0, m1, m2, m3); \ - _mm_store_ps(dst + 4 * (0 + 4 * u + 8 * v), m0); \ - _mm_store_ps(dst + 4 * (1 + 4 * u + 8 * v), m1); \ - _mm_store_ps(dst + 4 * (2 + 4 * u + 8 * v), m2); \ - _mm_store_ps(dst + 4 * (3 + 4 * u + 8 * v), m3); \ - } void AVX2GemmPostTreat(float* C, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias); #endif diff --git a/source/backend/cpu/x86_x64/avx/GemmFunction.hpp b/source/backend/cpu/x86_x64/avx/GemmFunction.hpp index 06f70523..d12f50f5 100644 --- a/source/backend/cpu/x86_x64/avx/GemmFunction.hpp +++ b/source/backend/cpu/x86_x64/avx/GemmFunction.hpp @@ -5,9 +5,22 @@ // Created by MNN on 
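Both quantization directions now carry the zero point: float to int8 adds it after scaling, then clamps and rounds half away from zero (which the AVX path assembles from its +/-0.5 constants), and int8 to float subtracts it before scaling. A scalar sketch of that per-lane contract.

#include <cstdint>
#include <cmath>
#include <algorithm>

// Per-lane quantization contract with a zero point (sketch):
//   q = clamp(x * scale + zeroPoint, minV, maxV) rounded half away from zero
//   x = (q - zeroPoint) * scale
static inline int8_t quantizeLane(float x, float scale, float zeroPoint, float minV, float maxV) {
    float v = x * scale + zeroPoint;
    v = std::min(std::max(v, minV), maxV);
    return static_cast<int8_t>(v >= 0.f ? std::floor(v + 0.5f) : std::ceil(v - 0.5f));
}

static inline float dequantizeLane(int8_t q, float scale, int32_t zeroPoint) {
    return static_cast<float>(static_cast<int32_t>(q) - zeroPoint) * scale;
}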
b'2020/09/22'. // Copyright © 2018, Alibaba Group Holding Limited +#define TRANPOSE_SAVE(u, v, z0, z3, z6, z9) \ + { \ + auto m0 = _mm256_extractf128_ps(z0, u); \ + auto m1 = _mm256_extractf128_ps(z3, u); \ + auto m2 = _mm256_extractf128_ps(z6, u); \ + auto m3 = _mm256_extractf128_ps(z9, u); \ + _MM_TRANSPOSE4_PS(m0, m1, m2, m3); \ + STORE_4(dst + 4 * (0 + 4 * u + 8 * v), m0); \ + STORE_4(dst + 4 * (1 + 4 * u + 8 * v), m1); \ + STORE_4(dst + 4 * (2 + 4 * u + 8 * v), m2); \ + STORE_4(dst + 4 * (3 + 4 * u + 8 * v), m3); \ + } + namespace { static inline __m128i mm_loadu_si128(const void* addr) { - return _mm_loadu_si128((__m128i const*)addr); + return _mm_castps_si128(LOAD4((const float*)addr)); } static inline __m256i mm256_broadcastsi128_si256(const void* addr) { @@ -15,54 +28,54 @@ static inline __m256i mm256_broadcastsi128_si256(const void* addr) { } } // namespace // - #define INIT_MAIN_24_4 \ - auto s0 = _mm256_loadu_ps(A + 0 * 24); \ - auto s1 = _mm256_loadu_ps(A + 0 * 24 + 8); \ - auto s2 = _mm256_loadu_ps(A + 0 * 24 + 16); \ - auto w0 = _mm256_broadcast_ss(weight + 0 * 4 + 0); \ + auto s0 = LOAD8(A + 0 * 24); \ + auto s1 = LOAD8(A + 0 * 24 + 8); \ + auto s2 = LOAD8(A + 0 * 24 + 16); \ + auto w0 = BROAD_LOAD(weight + 0 * 4 + 0); \ auto z0 = _mm256_mul_ps(s0, w0); \ auto z1 = _mm256_mul_ps(s1, w0); \ auto z2 = _mm256_mul_ps(s2, w0); \ - auto w1 = _mm256_broadcast_ss(weight + 0 * 4 + 1); \ + auto w1 = BROAD_LOAD(weight + 0 * 4 + 1); \ auto z3 = _mm256_mul_ps(s0, w1); \ auto z4 = _mm256_mul_ps(s1, w1); \ auto z5 = _mm256_mul_ps(s2, w1); \ - auto w2 = _mm256_broadcast_ss(weight + 0 * 4 + 2); \ + auto w2 = BROAD_LOAD(weight + 0 * 4 + 2); \ auto z6 = _mm256_mul_ps(s0, w2); \ auto z7 = _mm256_mul_ps(s1, w2); \ auto z8 = _mm256_mul_ps(s2, w2); \ - auto w3 = _mm256_broadcast_ss(weight + 0 * 4 + 3); \ + auto w3 = BROAD_LOAD(weight + 0 * 4 + 3); \ auto z9 = _mm256_mul_ps(s0, w3); \ auto z10 = _mm256_mul_ps(s1, w3); \ auto z11 = _mm256_mul_ps(s2, w3); #define COMPUTE_24_4 \ - s0 = _mm256_loadu_ps(A + sy * 24); \ - s1 = _mm256_loadu_ps(A + sy * 24 + 8); \ - s2 = _mm256_loadu_ps(A + sy * 24 + 16); \ - w0 = _mm256_broadcast_ss(weight + sy * 4 + 0); \ + s0 = LOAD8(A + sy * 24); \ + s1 = LOAD8(A + sy * 24 + 8); \ + s2 = LOAD8(A + sy * 24 + 16); \ + w0 = BROAD_LOAD(weight + sy * 4 + 0); \ z0 = MNNAVXFMA(s0, w0, z0); \ z1 = MNNAVXFMA(s1, w0, z1); \ z2 = MNNAVXFMA(s2, w0, z2); \ - w1 = _mm256_broadcast_ss(weight + sy * 4 + 1); \ + w1 = BROAD_LOAD(weight + sy * 4 + 1); \ z3 = MNNAVXFMA(s0, w1, z3); \ z4 = MNNAVXFMA(s1, w1, z4); \ z5 = MNNAVXFMA(s2, w1, z5); \ - w2 = _mm256_broadcast_ss(weight + sy * 4 + 2); \ + w2 = BROAD_LOAD(weight + sy * 4 + 2); \ z6 = MNNAVXFMA(s0, w2, z6); \ z7 = MNNAVXFMA(s1, w2, z7); \ z8 = MNNAVXFMA(s2, w2, z8); \ - w3 = _mm256_broadcast_ss(weight + sy * 4 + 3); \ + w3 = BROAD_LOAD(weight + sy * 4 + 3); \ z9 = MNNAVXFMA(s0, w3, z9); \ z10 = MNNAVXFMA(s1, w3, z10); \ z11 = MNNAVXFMA(s2, w3, z11); -static void _AVX_MNNPackedMatMul_24(float* C, const float* A, const float* B, const size_t* parameter) { +template +static void _AVX_MNNPackedMatMul_24(TYPE* C, const TYPE* A, const TYPE* B, const size_t* parameter) { auto h = parameter[2]; auto l = parameter[1]; - auto cStride = parameter[3] / sizeof(float); - auto bExtraStride = parameter[5] / sizeof(float); + auto cStride = parameter[3] / sizeof(TYPE); + auto bExtraStride = parameter[5] / sizeof(TYPE); auto bStride = bExtraStride + l * 4; auto hC4 = UP_DIV(h, 4); for (int y = 0; y < hC4; ++y) { @@ -82,44 +95,116 @@ static void 
_AVX_MNNPackedMatMul_24(float* C, const float* A, const float* B, co } } -#define INIT_MAIN_16_4 \ - auto s0 = _mm256_loadu_ps(A + 0 * aStride); \ - auto s1 = _mm256_loadu_ps(A + 0 * aStride + 8); \ - auto w0 = _mm256_broadcast_ss(weight + 0 * 4 + 0); \ + +#define EXPAND_128(x) _mm256_castsi256_ps(_mm256_broadcastsi128_si256(_mm_castps_si128((x)))) +// +#define INIT_MAIN_20_4 \ + auto s0 = LOAD8(A + 0 * aStride); \ + auto s1 = LOAD8(A + 0 * aStride + 8); \ + auto s2 = EXPAND_128(LOAD4(A + 0 * aStride + 16)); \ + auto w0 = BROAD_LOAD(weight + 0 * 4 + 0); \ auto z0 = _mm256_mul_ps(s0, w0); \ auto z1 = _mm256_mul_ps(s1, w0); \ - auto w1 = _mm256_broadcast_ss(weight + 0 * 4 + 1); \ + auto z2 = _mm256_mul_ps(s2, w0); \ + auto w1 = BROAD_LOAD(weight + 0 * 4 + 1); \ auto z3 = _mm256_mul_ps(s0, w1); \ auto z4 = _mm256_mul_ps(s1, w1); \ - auto w2 = _mm256_broadcast_ss(weight + 0 * 4 + 2); \ + auto z5 = _mm256_mul_ps(s2, w1); \ + auto w2 = BROAD_LOAD(weight + 0 * 4 + 2); \ auto z6 = _mm256_mul_ps(s0, w2); \ auto z7 = _mm256_mul_ps(s1, w2); \ - auto w3 = _mm256_broadcast_ss(weight + 0 * 4 + 3); \ + auto z8 = _mm256_mul_ps(s2, w2); \ + auto w3 = BROAD_LOAD(weight + 0 * 4 + 3); \ + auto z9 = _mm256_mul_ps(s0, w3); \ + auto z10 = _mm256_mul_ps(s1, w3); \ + auto z11 = _mm256_mul_ps(s2, w3); + +#define COMPUTE_20_4 \ + s0 = LOAD8(A + sy * aStride); \ + s1 = LOAD8(A + sy * aStride + 8); \ + s2 = EXPAND_128(LOAD4(A + sy * aStride + 16)); \ + w0 = BROAD_LOAD(weight + sy * 4 + 0); \ + z0 = MNNAVXFMA(s0, w0, z0); \ + z1 = MNNAVXFMA(s1, w0, z1); \ + z2 = MNNAVXFMA(s2, w0, z2); \ + w1 = BROAD_LOAD(weight + sy * 4 + 1); \ + z3 = MNNAVXFMA(s0, w1, z3); \ + z4 = MNNAVXFMA(s1, w1, z4); \ + z5 = MNNAVXFMA(s2, w1, z5); \ + w2 = BROAD_LOAD(weight + sy * 4 + 2); \ + z6 = MNNAVXFMA(s0, w2, z6); \ + z7 = MNNAVXFMA(s1, w2, z7); \ + z8 = MNNAVXFMA(s2, w2, z8); \ + w3 = BROAD_LOAD(weight + sy * 4 + 3); \ + z9 = MNNAVXFMA(s0, w3, z9); \ + z10 = MNNAVXFMA(s1, w3, z10); \ + z11 = MNNAVXFMA(s2, w3, z11); + + +template +static void _AVX_MNNPackedMatMul_20(TYPE* C, const TYPE* A, const TYPE* B, const size_t* parameter) { + auto aStride = parameter[0] / sizeof(TYPE); + auto h = parameter[2]; + auto l = parameter[1]; + auto cStride = parameter[3] / sizeof(TYPE); + auto bExtraStride = parameter[5] / sizeof(TYPE); + auto bStride = bExtraStride + l * 4; + auto hC4 = UP_DIV(h, 4); + for (int y = 0; y < hC4; ++y) { + auto weight = B + y * bStride; + auto dst = C + y * cStride; + INIT_MAIN_20_4; + + for (int sy = 1; sy < l; ++sy) { + COMPUTE_20_4; + } + TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(1, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); + } +} + +#define INIT_MAIN_16_4 \ + auto s0 = LOAD8(A + 0 * aStride); \ + auto s1 = LOAD8(A + 0 * aStride + 8); \ + auto w0 = BROAD_LOAD(weight + 0 * 4 + 0); \ + auto z0 = _mm256_mul_ps(s0, w0); \ + auto z1 = _mm256_mul_ps(s1, w0); \ + auto w1 = BROAD_LOAD(weight + 0 * 4 + 1); \ + auto z3 = _mm256_mul_ps(s0, w1); \ + auto z4 = _mm256_mul_ps(s1, w1); \ + auto w2 = BROAD_LOAD(weight + 0 * 4 + 2); \ + auto z6 = _mm256_mul_ps(s0, w2); \ + auto z7 = _mm256_mul_ps(s1, w2); \ + auto w3 = BROAD_LOAD(weight + 0 * 4 + 3); \ auto z9 = _mm256_mul_ps(s0, w3); \ auto z10 = _mm256_mul_ps(s1, w3); #define COMPUTE_16_4 \ - s0 = _mm256_loadu_ps(A + sy * aStride); \ - s1 = _mm256_loadu_ps(A + sy * aStride + 8); \ - w0 = _mm256_broadcast_ss(weight + sy * 4 + 0); \ + s0 = LOAD8(A + sy * aStride); \ + s1 = LOAD8(A + 
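The GEMM micro-kernels are templated on the element type so the same body can serve float and, through the LOAD/STORE macros, 16-bit storage; what each instance computes is unchanged. A scalar reference of the 24x4 tile under the packed layouts the intrinsics assume (A as l rows of 24 values, weight as l rows of 4, C written as 24 groups of 4); sketch only.

#include <cstddef>

// Scalar reference of the 24x4 micro-kernel: one horizontal strip of 24
// output positions against 4 output channels, accumulated over l.
void packedMatMulTile24x4(float* C, const float* A, const float* B, size_t l) {
    for (size_t x = 0; x < 24; ++x) {
        float acc[4] = {0.f, 0.f, 0.f, 0.f};
        for (size_t sy = 0; sy < l; ++sy) {
            float a = A[sy * 24 + x];
            for (int j = 0; j < 4; ++j) acc[j] += a * B[sy * 4 + j];
        }
        for (int j = 0; j < 4; ++j) C[4 * x + j] = acc[j];   // 4-wide interleaved store
    }
}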
sy * aStride + 8); \ + w0 = BROAD_LOAD(weight + sy * 4 + 0); \ z0 = MNNAVXFMA(s0, w0, z0); \ z1 = MNNAVXFMA(s1, w0, z1); \ - w1 = _mm256_broadcast_ss(weight + sy * 4 + 1); \ + w1 = BROAD_LOAD(weight + sy * 4 + 1); \ z3 = MNNAVXFMA(s0, w1, z3); \ z4 = MNNAVXFMA(s1, w1, z4); \ - w2 = _mm256_broadcast_ss(weight + sy * 4 + 2); \ + w2 = BROAD_LOAD(weight + sy * 4 + 2); \ z6 = MNNAVXFMA(s0, w2, z6); \ z7 = MNNAVXFMA(s1, w2, z7); \ - w3 = _mm256_broadcast_ss(weight + sy * 4 + 3); \ + w3 = BROAD_LOAD(weight + sy * 4 + 3); \ z9 = MNNAVXFMA(s0, w3, z9); \ z10 = MNNAVXFMA(s1, w3, z10); -static void _AVX_MNNPackedMatMul_16(float* C, const float* A, const float* B, const size_t* parameter) { - auto aStride = parameter[0] / sizeof(float); +template +static void _AVX_MNNPackedMatMul_16(TYPE* C, const TYPE* A, const TYPE* B, const size_t* parameter) { + auto aStride = parameter[0] / sizeof(TYPE); auto h = parameter[2]; auto l = parameter[1]; - auto cStride = parameter[3] / sizeof(float); - auto bExtraStride = parameter[5] / sizeof(float); + auto cStride = parameter[3] / sizeof(TYPE); + auto bExtraStride = parameter[5] / sizeof(TYPE); auto bStride = bExtraStride + l * 4; auto hC4 = UP_DIV(h, 4); for (int y = 0; y < hC4; ++y) { @@ -138,33 +223,34 @@ static void _AVX_MNNPackedMatMul_16(float* C, const float* A, const float* B, co } #define INIT_MAIN_8_4 \ - auto s0 = _mm256_loadu_ps(A + 0 * aStride); \ - auto w0 = _mm256_broadcast_ss(weight + 0 * 4 + 0); \ - auto w1 = _mm256_broadcast_ss(weight + 0 * 4 + 1); \ - auto w2 = _mm256_broadcast_ss(weight + 0 * 4 + 2); \ - auto w3 = _mm256_broadcast_ss(weight + 0 * 4 + 3); \ + auto s0 = LOAD8(A + 0 * aStride); \ + auto w0 = BROAD_LOAD(weight + 0 * 4 + 0); \ + auto w1 = BROAD_LOAD(weight + 0 * 4 + 1); \ + auto w2 = BROAD_LOAD(weight + 0 * 4 + 2); \ + auto w3 = BROAD_LOAD(weight + 0 * 4 + 3); \ auto z0 = _mm256_mul_ps(s0, w0); \ auto z3 = _mm256_mul_ps(s0, w1); \ auto z6 = _mm256_mul_ps(s0, w2); \ auto z9 = _mm256_mul_ps(s0, w3); #define COMPUTE_8_4 \ - s0 = _mm256_loadu_ps(A + sy * aStride); \ - w0 = _mm256_broadcast_ss(weight + sy * 4 + 0); \ - w1 = _mm256_broadcast_ss(weight + sy * 4 + 1); \ - w2 = _mm256_broadcast_ss(weight + sy * 4 + 2); \ - w3 = _mm256_broadcast_ss(weight + sy * 4 + 3); \ + s0 = LOAD8(A + sy * aStride); \ + w0 = BROAD_LOAD(weight + sy * 4 + 0); \ + w1 = BROAD_LOAD(weight + sy * 4 + 1); \ + w2 = BROAD_LOAD(weight + sy * 4 + 2); \ + w3 = BROAD_LOAD(weight + sy * 4 + 3); \ z0 = MNNAVXFMA(s0, w0, z0); \ z3 = MNNAVXFMA(s0, w1, z3); \ z6 = MNNAVXFMA(s0, w2, z6); \ z9 = MNNAVXFMA(s0, w3, z9); -static void _AVX_MNNPackedMatMul_8(float* C, const float* A, const float* B, const size_t* parameter) { - auto aStride = parameter[0] / sizeof(float); +template +static void _AVX_MNNPackedMatMul_8(TYPE* C, const TYPE* A, const TYPE* B, const size_t* parameter) { + auto aStride = parameter[0] / sizeof(TYPE); auto h = parameter[2]; auto l = parameter[1]; - auto cStride = parameter[3] / sizeof(float); - auto bExtraStride = parameter[5] / sizeof(float); + auto cStride = parameter[3] / sizeof(TYPE); + auto bExtraStride = parameter[5] / sizeof(TYPE); auto bStride = bExtraStride + l * 4; auto hC4 = UP_DIV(h, 4); for (int y = 0; y < hC4; ++y) { @@ -179,12 +265,13 @@ static void _AVX_MNNPackedMatMul_8(float* C, const float* A, const float* B, con TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); } } -static void _AVX_MNNPackedMatMul_5(float* C, const float* A, const float* B, const size_t* parameter) { - auto aStride = parameter[0] / sizeof(float); +template +static void 
_AVX_MNNPackedMatMul_5(TYPE* C, const TYPE* A, const TYPE* B, const size_t* parameter) { + auto aStride = parameter[0] / sizeof(TYPE); auto h = parameter[2]; auto l = parameter[1]; - auto cStride = parameter[3] / sizeof(float); - auto bExtraStride = parameter[5] / sizeof(float); + auto cStride = parameter[3] / sizeof(TYPE); + auto bExtraStride = parameter[5] / sizeof(TYPE); auto bStride = bExtraStride + l * 4; auto hC4 = UP_DIV(h, 4); int lC4 = l / 4; @@ -219,11 +306,11 @@ static void _AVX_MNNPackedMatMul_5(float* C, const float* A, const float* B, con auto srcUse = src; for (int sy = 0; sy < l; ++sy) { - auto S0 = _mm256_broadcast_ss(srcUse + 0); - auto S1 = _mm256_broadcast_ss(srcUse + 1); - auto S2 = _mm256_broadcast_ss(srcUse + 2); - auto S3 = _mm256_broadcast_ss(srcUse + 3); - auto S4 = _mm256_broadcast_ss(srcUse + 4); + auto S0 = BROAD_LOAD(srcUse + 0); + auto S1 = BROAD_LOAD(srcUse + 1); + auto S2 = BROAD_LOAD(srcUse + 2); + auto S3 = BROAD_LOAD(srcUse + 3); + auto S4 = BROAD_LOAD(srcUse + 4); auto W0 = _mm256_castsi256_ps(_mm256_insertf128_si256(mm256_broadcastsi128_si256(weight0), mm_loadu_si128(weight1), 1)); auto W1 = _mm256_castsi256_ps(_mm256_insertf128_si256(mm256_broadcastsi128_si256(weight2), mm_loadu_si128(weight3), 1)); @@ -248,31 +335,31 @@ static void _AVX_MNNPackedMatMul_5(float* C, const float* A, const float* B, con weight2 += 4; weight3 += 4; } - _mm256_storeu_ps(dst0 + 0, _mm256_permute2f128_ps(sumAvx00, sumAvx10, 32)); - _mm256_storeu_ps(dst0 + 8, _mm256_permute2f128_ps(sumAvx20, sumAvx30, 32)); - _mm_storeu_ps(dst0 + 16, _mm256_extractf128_ps(sumAvx40, 0)); + STORE_8(dst0 + 0, _mm256_permute2f128_ps(sumAvx00, sumAvx10, 32)); + STORE_8(dst0 + 8, _mm256_permute2f128_ps(sumAvx20, sumAvx30, 32)); + STORE_4(dst0 + 16, _mm256_extractf128_ps(sumAvx40, 0)); - _mm256_storeu_ps(dst1 + 0, _mm256_permute2f128_ps(sumAvx00, sumAvx10, 49)); - _mm256_storeu_ps(dst1 + 8, _mm256_permute2f128_ps(sumAvx20, sumAvx30, 49)); - _mm_storeu_ps(dst1 + 16, _mm256_extractf128_ps(sumAvx40, 1)); + STORE_8(dst1 + 0, _mm256_permute2f128_ps(sumAvx00, sumAvx10, 49)); + STORE_8(dst1 + 8, _mm256_permute2f128_ps(sumAvx20, sumAvx30, 49)); + STORE_4(dst1 + 16, _mm256_extractf128_ps(sumAvx40, 1)); - _mm256_storeu_ps(dst2 + 0, _mm256_permute2f128_ps(sumAvx01, sumAvx11, 32)); - _mm256_storeu_ps(dst2 + 8, _mm256_permute2f128_ps(sumAvx21, sumAvx31, 32)); - _mm_storeu_ps(dst2 + 16, _mm256_extractf128_ps(sumAvx41, 0)); + STORE_8(dst2 + 0, _mm256_permute2f128_ps(sumAvx01, sumAvx11, 32)); + STORE_8(dst2 + 8, _mm256_permute2f128_ps(sumAvx21, sumAvx31, 32)); + STORE_4(dst2 + 16, _mm256_extractf128_ps(sumAvx41, 0)); - _mm256_storeu_ps(dst3 + 0, _mm256_permute2f128_ps(sumAvx01, sumAvx11, 49)); - _mm256_storeu_ps(dst3 + 8, _mm256_permute2f128_ps(sumAvx21, sumAvx31, 49)); - _mm_storeu_ps(dst3 + 16, _mm256_extractf128_ps(sumAvx41, 1)); + STORE_8(dst3 + 0, _mm256_permute2f128_ps(sumAvx01, sumAvx11, 49)); + STORE_8(dst3 + 8, _mm256_permute2f128_ps(sumAvx21, sumAvx31, 49)); + STORE_4(dst3 + 16, _mm256_extractf128_ps(sumAvx41, 1)); } for (int y = hR; y < hC4; ++y) { auto weight = B + y * bStride; auto dst = C + y * cStride; - auto s0 = _mm_broadcast_ss(A + 0 * aStride + 0); - auto s1 = _mm_broadcast_ss(A + 0 * aStride + 1); - auto s2 = _mm_broadcast_ss(A + 0 * aStride + 2); - auto s3 = _mm_broadcast_ss(A + 0 * aStride + 3); - auto s4 = _mm_broadcast_ss(A + 0 * aStride + 4); - auto w0 = _mm_loadu_ps(weight + 0 * 4); + auto s0 = BROAD_LOAD_4(A + 0 * aStride + 0); + auto s1 = BROAD_LOAD_4(A + 0 * aStride + 1); + auto s2 = 
BROAD_LOAD_4(A + 0 * aStride + 2); + auto s3 = BROAD_LOAD_4(A + 0 * aStride + 3); + auto s4 = BROAD_LOAD_4(A + 0 * aStride + 4); + auto w0 = LOAD4(weight + 0 * 4); auto z0 = _mm_mul_ps(s0, w0); auto z1 = _mm_mul_ps(s1, w0); auto z2 = _mm_mul_ps(s2, w0); @@ -280,33 +367,33 @@ static void _AVX_MNNPackedMatMul_5(float* C, const float* A, const float* B, con auto z4 = _mm_mul_ps(s4, w0); for (int sy = 1; sy < l; ++sy) { - s0 = _mm_broadcast_ss(A + sy * aStride + 0); - s1 = _mm_broadcast_ss(A + sy * aStride + 1); - s2 = _mm_broadcast_ss(A + sy * aStride + 2); - s3 = _mm_broadcast_ss(A + sy * aStride + 3); - s4 = _mm_broadcast_ss(A + sy * aStride + 4); - w0 = _mm_loadu_ps(weight + sy * 4); + s0 = BROAD_LOAD_4(A + sy * aStride + 0); + s1 = BROAD_LOAD_4(A + sy * aStride + 1); + s2 = BROAD_LOAD_4(A + sy * aStride + 2); + s3 = BROAD_LOAD_4(A + sy * aStride + 3); + s4 = BROAD_LOAD_4(A + sy * aStride + 4); + w0 = LOAD4(weight + sy * 4); z0 = MNNSSEFMA(s0, w0, z0); z1 = MNNSSEFMA(s1, w0, z1); z2 = MNNSSEFMA(s2, w0, z2); z3 = MNNSSEFMA(s3, w0, z3); z4 = MNNSSEFMA(s4, w0, z4); } - _mm_store_ps(dst + 4 * 0, z0); - _mm_store_ps(dst + 4 * 1, z1); - _mm_store_ps(dst + 4 * 2, z2); - _mm_store_ps(dst + 4 * 3, z3); - _mm_store_ps(dst + 4 * 4, z4); + STORE_4(dst + 4 * 0, z0); + STORE_4(dst + 4 * 1, z1); + STORE_4(dst + 4 * 2, z2); + STORE_4(dst + 4 * 3, z3); + STORE_4(dst + 4 * 4, z4); } } - -static void _AVX_MNNPackedMatMul_3(float* C, const float* A, const float* B, const size_t* parameter) { - auto aStride = parameter[0] / sizeof(float); +template +static void _AVX_MNNPackedMatMul_3(TYPE* C, const TYPE* A, const TYPE* B, const size_t* parameter) { + auto aStride = parameter[0] / sizeof(TYPE); auto h = parameter[2]; auto l = parameter[1]; - auto cStride = parameter[3] / sizeof(float); - auto bExtraStride = parameter[5] / sizeof(float); + auto cStride = parameter[3] / sizeof(TYPE); + auto bExtraStride = parameter[5] / sizeof(TYPE); auto bStride = bExtraStride + l * 4; auto hC4 = UP_DIV(h, 4); int lC4 = l / 4; @@ -335,9 +422,9 @@ static void _AVX_MNNPackedMatMul_3(float* C, const float* A, const float* B, con auto srcUse = src; for (int sy = 0; sy < l; ++sy) { - auto S0 = _mm256_broadcast_ss(srcUse + 0); - auto S1 = _mm256_broadcast_ss(srcUse + 1); - auto S2 = _mm256_broadcast_ss(srcUse + 2); + auto S0 = BROAD_LOAD(srcUse + 0); + auto S1 = BROAD_LOAD(srcUse + 1); + auto S2 = BROAD_LOAD(srcUse + 2); auto W0 = _mm256_castsi256_ps(_mm256_insertf128_si256(mm256_broadcastsi128_si256(weight0), mm_loadu_si128(weight1), 1)); auto W1 = _mm256_castsi256_ps(_mm256_insertf128_si256(mm256_broadcastsi128_si256(weight2), mm_loadu_si128(weight3), 1)); @@ -356,55 +443,55 @@ static void _AVX_MNNPackedMatMul_3(float* C, const float* A, const float* B, con weight2 += 4; weight3 += 4; } - _mm_storeu_ps(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0)); - _mm_storeu_ps(dst0 + 4, _mm256_extractf128_ps(sumAvx10, 0)); - _mm_storeu_ps(dst0 + 8, _mm256_extractf128_ps(sumAvx20, 0)); + STORE_4(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0)); + STORE_4(dst0 + 4, _mm256_extractf128_ps(sumAvx10, 0)); + STORE_4(dst0 + 8, _mm256_extractf128_ps(sumAvx20, 0)); - _mm_storeu_ps(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1)); - _mm_storeu_ps(dst1 + 4, _mm256_extractf128_ps(sumAvx10, 1)); - _mm_storeu_ps(dst1 + 8, _mm256_extractf128_ps(sumAvx20, 1)); + STORE_4(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1)); + STORE_4(dst1 + 4, _mm256_extractf128_ps(sumAvx10, 1)); + STORE_4(dst1 + 8, _mm256_extractf128_ps(sumAvx20, 1)); - _mm_storeu_ps(dst2 + 0, 
_mm256_extractf128_ps(sumAvx01, 0)); - _mm_storeu_ps(dst2 + 4, _mm256_extractf128_ps(sumAvx11, 0)); - _mm_storeu_ps(dst2 + 8, _mm256_extractf128_ps(sumAvx21, 0)); + STORE_4(dst2 + 0, _mm256_extractf128_ps(sumAvx01, 0)); + STORE_4(dst2 + 4, _mm256_extractf128_ps(sumAvx11, 0)); + STORE_4(dst2 + 8, _mm256_extractf128_ps(sumAvx21, 0)); - _mm_storeu_ps(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1)); - _mm_storeu_ps(dst3 + 4, _mm256_extractf128_ps(sumAvx11, 1)); - _mm_storeu_ps(dst3 + 8, _mm256_extractf128_ps(sumAvx21, 1)); + STORE_4(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1)); + STORE_4(dst3 + 4, _mm256_extractf128_ps(sumAvx11, 1)); + STORE_4(dst3 + 8, _mm256_extractf128_ps(sumAvx21, 1)); } for (int y = hR; y < hC4; ++y) { auto weight = B + y * bStride; auto dst = C + y * cStride; - auto s0 = _mm_broadcast_ss(A + 0 * aStride + 0); - auto s1 = _mm_broadcast_ss(A + 0 * aStride + 1); - auto s2 = _mm_broadcast_ss(A + 0 * aStride + 2); - auto w0 = _mm_loadu_ps(weight + 0 * 4); + auto s0 = BROAD_LOAD_4(A + 0 * aStride + 0); + auto s1 = BROAD_LOAD_4(A + 0 * aStride + 1); + auto s2 = BROAD_LOAD_4(A + 0 * aStride + 2); + auto w0 = LOAD4(weight + 0 * 4); auto z0 = _mm_mul_ps(s0, w0); auto z1 = _mm_mul_ps(s1, w0); auto z2 = _mm_mul_ps(s2, w0); for (int sy = 1; sy < l; ++sy) { - s0 = _mm_broadcast_ss(A + sy * aStride + 0); - s1 = _mm_broadcast_ss(A + sy * aStride + 1); - s2 = _mm_broadcast_ss(A + sy * aStride + 2); - w0 = _mm_loadu_ps(weight + sy * 4); + s0 = BROAD_LOAD_4(A + sy * aStride + 0); + s1 = BROAD_LOAD_4(A + sy * aStride + 1); + s2 = BROAD_LOAD_4(A + sy * aStride + 2); + w0 = LOAD4(weight + sy * 4); z0 = MNNSSEFMA(s0, w0, z0); z1 = MNNSSEFMA(s1, w0, z1); z2 = MNNSSEFMA(s2, w0, z2); } - _mm_store_ps(dst + 4 * 0, z0); - _mm_store_ps(dst + 4 * 1, z1); - _mm_store_ps(dst + 4 * 2, z2); + STORE_4(dst + 4 * 0, z0); + STORE_4(dst + 4 * 1, z1); + STORE_4(dst + 4 * 2, z2); } } - -static void _AVX_MNNPackedMatMul_2(float* C, const float* A, const float* B, const size_t* parameter) { - auto aStride = parameter[0] / sizeof(float); +template +static void _AVX_MNNPackedMatMul_2(TYPE* C, const TYPE* A, const TYPE* B, const size_t* parameter) { + auto aStride = parameter[0] / sizeof(TYPE); auto h = parameter[2]; auto l = parameter[1]; - auto cStride = parameter[3] / sizeof(float); - auto bExtraStride = parameter[5] / sizeof(float); + auto cStride = parameter[3] / sizeof(TYPE); + auto bExtraStride = parameter[5] / sizeof(TYPE); auto bStride = bExtraStride + l * 4; auto hC4 = UP_DIV(h, 4); int lC4 = l / 4; @@ -430,8 +517,8 @@ static void _AVX_MNNPackedMatMul_2(float* C, const float* A, const float* B, con auto srcUse = src; for (int sy = 0; sy < l; ++sy) { - auto S0 = _mm256_broadcast_ss(srcUse + 0); - auto S1 = _mm256_broadcast_ss(srcUse + 1); + auto S0 = BROAD_LOAD(srcUse + 0); + auto S1 = BROAD_LOAD(srcUse + 1); auto W0 = _mm256_castsi256_ps(_mm256_insertf128_si256(mm256_broadcastsi128_si256(weight0), mm_loadu_si128(weight1), 1)); auto W1 = _mm256_castsi256_ps(_mm256_insertf128_si256(mm256_broadcastsi128_si256(weight2), mm_loadu_si128(weight3), 1)); @@ -447,46 +534,47 @@ static void _AVX_MNNPackedMatMul_2(float* C, const float* A, const float* B, con weight2 += 4; weight3 += 4; } - _mm_storeu_ps(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0)); - _mm_storeu_ps(dst0 + 4, _mm256_extractf128_ps(sumAvx10, 0)); + STORE_4(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0)); + STORE_4(dst0 + 4, _mm256_extractf128_ps(sumAvx10, 0)); - _mm_storeu_ps(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1)); - _mm_storeu_ps(dst1 + 4, 
_mm256_extractf128_ps(sumAvx10, 1)); + STORE_4(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1)); + STORE_4(dst1 + 4, _mm256_extractf128_ps(sumAvx10, 1)); - _mm_storeu_ps(dst2 + 0, _mm256_extractf128_ps(sumAvx01, 0)); - _mm_storeu_ps(dst2 + 4, _mm256_extractf128_ps(sumAvx11, 0)); + STORE_4(dst2 + 0, _mm256_extractf128_ps(sumAvx01, 0)); + STORE_4(dst2 + 4, _mm256_extractf128_ps(sumAvx11, 0)); - _mm_storeu_ps(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1)); - _mm_storeu_ps(dst3 + 4, _mm256_extractf128_ps(sumAvx11, 1)); + STORE_4(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1)); + STORE_4(dst3 + 4, _mm256_extractf128_ps(sumAvx11, 1)); } for (int y = hR; y < hC4; ++y) { auto weight = B + y * bStride; auto dst = C + y * cStride; - auto s0 = _mm_broadcast_ss(A + 0 * aStride + 0); - auto s1 = _mm_broadcast_ss(A + 0 * aStride + 1); - auto w0 = _mm_loadu_ps(weight + 0 * 4); + auto s0 = BROAD_LOAD_4(A + 0 * aStride + 0); + auto s1 = BROAD_LOAD_4(A + 0 * aStride + 1); + auto w0 = LOAD4(weight + 0 * 4); auto z0 = _mm_mul_ps(s0, w0); auto z1 = _mm_mul_ps(s1, w0); for (int sy = 1; sy < l; ++sy) { - s0 = _mm_broadcast_ss(A + sy * aStride + 0); - s1 = _mm_broadcast_ss(A + sy * aStride + 1); - w0 = _mm_loadu_ps(weight + sy * 4); + s0 = BROAD_LOAD_4(A + sy * aStride + 0); + s1 = BROAD_LOAD_4(A + sy * aStride + 1); + w0 = LOAD4(weight + sy * 4); z0 = MNNSSEFMA(s0, w0, z0); z1 = MNNSSEFMA(s1, w0, z1); } - _mm_store_ps(dst + 4 * 0, z0); - _mm_store_ps(dst + 4 * 1, z1); + STORE_4(dst + 4 * 0, z0); + STORE_4(dst + 4 * 1, z1); } } -static void _AVX_MNNPackedMatMul_4(float* C, const float* A, const float* B, const size_t* parameter) { - auto aStride = parameter[0] / sizeof(float); +template +static void _AVX_MNNPackedMatMul_4(TYPE* C, const TYPE* A, const TYPE* B, const size_t* parameter) { + auto aStride = parameter[0] / sizeof(TYPE); auto h = parameter[2]; auto l = parameter[1]; - auto cStride = parameter[3] / sizeof(float); - auto bExtraStride = parameter[5] / sizeof(float); + auto cStride = parameter[3] / sizeof(TYPE); + auto bExtraStride = parameter[5] / sizeof(TYPE); auto bStride = bExtraStride + l * 4; auto hC4 = UP_DIV(h, 4); int lC4 = l / 4; @@ -519,11 +607,11 @@ static void _AVX_MNNPackedMatMul_4(float* C, const float* A, const float* B, con auto srcUse = src; for (int sy = 0; sy < l; ++sy) { #define LOAD_S_4(i) \ -auto s##i##0 = _mm256_castps_si256(_mm256_broadcast_ss(srcUse + (i) * aStride + 0));\ -auto s##i##1 = _mm_castps_si128(_mm_broadcast_ss(srcUse + (i) * aStride + 1));\ +auto s##i##0 = _mm256_castps_si256(BROAD_LOAD(srcUse + (i) * aStride + 0));\ +auto s##i##1 = _mm_castps_si128(BROAD_LOAD_4(srcUse + (i) * aStride + 1));\ auto S##i##0 = _mm256_castsi256_ps(_mm256_insertf128_si256(s##i##0, s##i##1, 1));\ -s##i##0 = _mm256_castps_si256(_mm256_broadcast_ss(srcUse + (i) * aStride + 2));\ -s##i##1 = _mm_castps_si128(_mm_broadcast_ss(srcUse + (i) * aStride + 3));\ +s##i##0 = _mm256_castps_si256(BROAD_LOAD(srcUse + (i) * aStride + 2));\ +s##i##1 = _mm_castps_si128(BROAD_LOAD_4(srcUse + (i) * aStride + 3));\ auto S##i##1 = _mm256_castsi256_ps(_mm256_insertf128_si256(s##i##0, s##i##1, 1));\ LOAD_S_4(0); @@ -550,90 +638,96 @@ auto S##i##1 = _mm256_castsi256_ps(_mm256_insertf128_si256(s##i##0, s##i##1, 1)) weight2 += 4; weight3 += 4; } - _mm256_storeu_ps(dst0, sumAvx00); - _mm256_storeu_ps(dst0 + 8, sumAvx01); - _mm256_storeu_ps(dst1, sumAvx10); - _mm256_storeu_ps(dst1 + 8, sumAvx11); - _mm256_storeu_ps(dst2, sumAvx20); - _mm256_storeu_ps(dst2 + 8, sumAvx21); - _mm256_storeu_ps(dst3, sumAvx30); - 
_mm256_storeu_ps(dst3 + 8, sumAvx31); + STORE_8(dst0, sumAvx00); + STORE_8(dst0 + 8, sumAvx01); + STORE_8(dst1, sumAvx10); + STORE_8(dst1 + 8, sumAvx11); + STORE_8(dst2, sumAvx20); + STORE_8(dst2 + 8, sumAvx21); + STORE_8(dst3, sumAvx30); + STORE_8(dst3 + 8, sumAvx31); } for (int y = hR; y < hC4; ++y) { auto weight = B + y * bStride; auto dst = C + y * cStride; - auto s0 = _mm_loadu_ps(A + 0 * aStride); - auto w0 = _mm_broadcast_ss(weight + 0 * 4 + 0); - auto w1 = _mm_broadcast_ss(weight + 0 * 4 + 1); - auto w2 = _mm_broadcast_ss(weight + 0 * 4 + 2); - auto w3 = _mm_broadcast_ss(weight + 0 * 4 + 3); + auto s0 = LOAD4(A + 0 * aStride); + auto w0 = BROAD_LOAD_4(weight + 0 * 4 + 0); + auto w1 = BROAD_LOAD_4(weight + 0 * 4 + 1); + auto w2 = BROAD_LOAD_4(weight + 0 * 4 + 2); + auto w3 = BROAD_LOAD_4(weight + 0 * 4 + 3); auto z0 = _mm_mul_ps(s0, w0); auto z3 = _mm_mul_ps(s0, w1); auto z6 = _mm_mul_ps(s0, w2); auto z9 = _mm_mul_ps(s0, w3); for (int sy = 1; sy < l; ++sy) { - s0 = _mm_loadu_ps(A + sy * aStride); - w0 = _mm_broadcast_ss(weight + sy * 4 + 0); - w1 = _mm_broadcast_ss(weight + sy * 4 + 1); - w2 = _mm_broadcast_ss(weight + sy * 4 + 2); - w3 = _mm_broadcast_ss(weight + sy * 4 + 3); + s0 = LOAD4(A + sy * aStride); + w0 = BROAD_LOAD_4(weight + sy * 4 + 0); + w1 = BROAD_LOAD_4(weight + sy * 4 + 1); + w2 = BROAD_LOAD_4(weight + sy * 4 + 2); + w3 = BROAD_LOAD_4(weight + sy * 4 + 3); z0 = MNNSSEFMA(s0, w0, z0); z3 = MNNSSEFMA(s0, w1, z3); z6 = MNNSSEFMA(s0, w2, z6); z9 = MNNSSEFMA(s0, w3, z9); } _MM_TRANSPOSE4_PS(z0, z3, z6, z9); - _mm_store_ps(dst + 4 * 0, z0); - _mm_store_ps(dst + 4 * 1, z3); - _mm_store_ps(dst + 4 * 2, z6); - _mm_store_ps(dst + 4 * 3, z9); + STORE_4(dst + 4 * 0, z0); + STORE_4(dst + 4 * 1, z3); + STORE_4(dst + 4 * 2, z6); + STORE_4(dst + 4 * 3, z9); } } -static void _AVX_MNNPackednMatMulRemainCommon(float* C, const float* A, const float* B, size_t eSize, - const size_t* parameter, float* cache, const float* postParameters, - const float* bias) { +template +static void _AVX_MNNPackednMatMulRemainCommon(TYPE* C, const TYPE* A, const TYPE* B, size_t eSize, + const size_t* parameter) { auto h = parameter[2]; auto l = parameter[1]; - auto cStride = parameter[3] / sizeof(float); - auto bExtraStride = parameter[5] / sizeof(float); + auto cStride = parameter[3] / sizeof(TYPE); + auto bExtraStride = parameter[5] / sizeof(TYPE); auto bStride = bExtraStride + l * 4; auto hC4 = UP_DIV(h, 4); auto es = eSize; auto oC = C; - auto aStride = parameter[0] / sizeof(float); + auto aStride = parameter[0] / sizeof(TYPE); + if (eSize >= 20) { + _AVX_MNNPackedMatMul_20(C, A, B, parameter); + eSize -= 20; + C += 20 * 4; + A += 20; + } if (eSize >= 16) { - _AVX_MNNPackedMatMul_16(C, A, B, parameter); + _AVX_MNNPackedMatMul_16(C, A, B, parameter); eSize -= 16; C += 16 * 4; A += 16; } if (eSize >= 8) { - _AVX_MNNPackedMatMul_8(C, A, B, parameter); + _AVX_MNNPackedMatMul_8(C, A, B, parameter); eSize -= 8; C += 8 * 4; A += 8; } if (eSize >= 5) { - _AVX_MNNPackedMatMul_5(C, A, B, parameter); + _AVX_MNNPackedMatMul_5(C, A, B, parameter); eSize -= 5; C += 5 * 4; A += 5; } if (eSize == 4) { - _AVX_MNNPackedMatMul_4(C, A, B, parameter); + _AVX_MNNPackedMatMul_4(C, A, B, parameter); eSize -= 4; C += 4 * 4; A += 4; } if (eSize == 3) { - _AVX_MNNPackedMatMul_3(C, A, B, parameter); + _AVX_MNNPackedMatMul_3(C, A, B, parameter); eSize -= 3; C += 3 * 4; A += 3; } if (eSize == 2) { - _AVX_MNNPackedMatMul_2(C, A, B, parameter); + _AVX_MNNPackedMatMul_2(C, A, B, parameter); eSize -= 2; C += 2 * 4; A += 2; @@ -671,21 
+765,21 @@ static void _AVX_MNNPackednMatMulRemainCommon(float* C, const float* A, const fl auto srcUse = src; for (int sy = 0; sy < lC4; ++sy) { - auto s0 = _mm256_castps_si256(_mm256_broadcast_ss(srcUse + (0) * aStride)); - auto s1 = _mm_castps_si128(_mm_broadcast_ss(srcUse + (1) * aStride)); + auto s0 = _mm256_castps_si256(BROAD_LOAD(srcUse + (0) * aStride)); + auto s1 = _mm_castps_si128(BROAD_LOAD_4(srcUse + (1) * aStride)); auto S0 = _mm256_castsi256_ps(_mm256_insertf128_si256(s0, s1, 1)); - auto d0 = _mm256_castps_si256(_mm256_broadcast_ss(srcUse + (2) * aStride)); - auto d1 = _mm_castps_si128(_mm_broadcast_ss(srcUse + (3) * aStride)); + auto d0 = _mm256_castps_si256(BROAD_LOAD(srcUse + (2) * aStride)); + auto d1 = _mm_castps_si128(BROAD_LOAD_4(srcUse + (3) * aStride)); auto S1 = _mm256_castsi256_ps(_mm256_insertf128_si256(d0, d1, 1)); - auto W00 = _mm256_loadu_ps(weight0 + 16 * sy + 0); - auto W01 = _mm256_loadu_ps(weight0 + 16 * sy + 8); - auto W10 = _mm256_loadu_ps(weight1 + 16 * sy + 0); - auto W11 = _mm256_loadu_ps(weight1 + 16 * sy + 8); + auto W00 = LOAD8(weight0 + 16 * sy + 0); + auto W01 = LOAD8(weight0 + 16 * sy + 8); + auto W10 = LOAD8(weight1 + 16 * sy + 0); + auto W11 = LOAD8(weight1 + 16 * sy + 8); - auto W20 = _mm256_loadu_ps(weight2 + 16 * sy + 0); - auto W21 = _mm256_loadu_ps(weight2 + 16 * sy + 8); - auto W30 = _mm256_loadu_ps(weight3 + 16 * sy + 0); - auto W31 = _mm256_loadu_ps(weight3 + 16 * sy + 8); + auto W20 = LOAD8(weight2 + 16 * sy + 0); + auto W21 = LOAD8(weight2 + 16 * sy + 8); + auto W30 = LOAD8(weight3 + 16 * sy + 0); + auto W31 = LOAD8(weight3 + 16 * sy + 8); sumAvx00 = MNNAVXFMA(S0, W00, sumAvx00); sumAvx01 = MNNAVXFMA(S1, W01, sumAvx01); @@ -718,21 +812,21 @@ static void _AVX_MNNPackednMatMulRemainCommon(float* C, const float* A, const fl auto sum31 = _mm256_extractf128_ps(sumAvx30, 1); auto sum3 = _mm_add_ps(sum30, sum31); for (int sy = lR; sy < l; ++sy) { - auto s = _mm_broadcast_ss(srcUse); - auto w0 = _mm_loadu_ps(weight0 + 4 * sy); - auto w1 = _mm_loadu_ps(weight1 + 4 * sy); - auto w2 = _mm_loadu_ps(weight2 + 4 * sy); - auto w3 = _mm_loadu_ps(weight3 + 4 * sy); + auto s = BROAD_LOAD_4(srcUse); + auto w0 = LOAD4(weight0 + 4 * sy); + auto w1 = LOAD4(weight1 + 4 * sy); + auto w2 = LOAD4(weight2 + 4 * sy); + auto w3 = LOAD4(weight3 + 4 * sy); sum0 = MNNSSEFMA(s, w0, sum0); sum1 = MNNSSEFMA(s, w1, sum1); sum2 = MNNSSEFMA(s, w2, sum2); sum3 = MNNSSEFMA(s, w3, sum3); srcUse += aStride; } - _mm_store_ps(dst0, sum0); - _mm_store_ps(dst1, sum1); - _mm_store_ps(dst2, sum2); - _mm_store_ps(dst3, sum3); + STORE_4(dst0, sum0); + STORE_4(dst1, sum1); + STORE_4(dst2, sum2); + STORE_4(dst3, sum3); } for (int y = hR; y < hC4; ++y) { auto weight = B + y * bStride; @@ -741,14 +835,14 @@ static void _AVX_MNNPackednMatMulRemainCommon(float* C, const float* A, const fl auto sumAvx1 = _mm256_set1_ps(0.0f); auto srcUse = src; for (int sy = 0; sy < lC4; ++sy) { - auto s0 = _mm256_castps_si256(_mm256_broadcast_ss(srcUse + (0) * aStride)); - auto s1 = _mm_castps_si128(_mm_broadcast_ss(srcUse + (1) * aStride)); + auto s0 = _mm256_castps_si256(BROAD_LOAD(srcUse + (0) * aStride)); + auto s1 = _mm_castps_si128(BROAD_LOAD_4(srcUse + (1) * aStride)); auto S0 = _mm256_castsi256_ps(_mm256_insertf128_si256(s0, s1, 1)); - auto d0 = _mm256_castps_si256(_mm256_broadcast_ss(srcUse + (2) * aStride)); - auto d1 = _mm_castps_si128(_mm_broadcast_ss(srcUse + (3) * aStride)); + auto d0 = _mm256_castps_si256(BROAD_LOAD(srcUse + (2) * aStride)); + auto d1 = _mm_castps_si128(BROAD_LOAD_4(srcUse + 
(3) * aStride)); auto S1 = _mm256_castsi256_ps(_mm256_insertf128_si256(d0, d1, 1)); - auto W0 = _mm256_loadu_ps(weight + 16 * sy + 0); - auto W1 = _mm256_loadu_ps(weight + 16 * sy + 8); + auto W0 = LOAD8(weight + 16 * sy + 0); + auto W1 = LOAD8(weight + 16 * sy + 8); sumAvx0 = MNNAVXFMA(S0, W0, sumAvx0); sumAvx1 = MNNAVXFMA(S1, W1, sumAvx1); srcUse += 4 * aStride; @@ -758,11 +852,11 @@ static void _AVX_MNNPackednMatMulRemainCommon(float* C, const float* A, const fl auto sum1 = _mm256_extractf128_ps(sumAvx0, 1); auto sum = _mm_add_ps(sum0, sum1); for (int sy = lR; sy < l; ++sy) { - auto s = _mm_broadcast_ss(srcUse); - auto w = _mm_loadu_ps(weight + 4 * sy); + auto s = BROAD_LOAD_4(srcUse); + auto w = LOAD4(weight + 4 * sy); sum = MNNSSEFMA(s, w, sum); srcUse += aStride; } - _mm_store_ps(dst, sum); + STORE_4(dst, sum); } } diff --git a/source/backend/cpu/x86_x64/avx/GemmFunctionPackL.hpp b/source/backend/cpu/x86_x64/avx/GemmFunctionPackL.hpp new file mode 100644 index 00000000..409fea53 --- /dev/null +++ b/source/backend/cpu/x86_x64/avx/GemmFunctionPackL.hpp @@ -0,0 +1,189 @@ +// +// GemmFunctionPackL.hpp +// MNN +// +// Created by MNN on b'2021/02/01'. +// Copyright © 2018, Alibaba Group Holding Limited + +namespace { +static inline __m128i mm_loadu_si128(const void* addr) { + return _mm_castps_si128(LOAD4((const float*)addr)); +} + +static inline __m256i mm256_broadcastsi128_si256(const void* addr) { + return _mm256_broadcastsi128_si256(mm_loadu_si128(addr)); +} +} // namespace +// + +template +static void _AVX_MNNPackedMatMul_3(TYPE* C, const TYPE* A, const TYPE* B, const size_t* parameter) { + auto aStride = 3; + auto h = parameter[2]; + auto l = parameter[1]; + auto cStride = parameter[3] / sizeof(TYPE); + auto bExtraStride = parameter[5] / sizeof(TYPE); + auto lC8 = UP_DIV(l, 8); + auto hC4 = UP_DIV(h, 4); + const int hC4Unit = 4; + auto src = A; + __m256 temp; + for (int y = 0; y < hC4; ++y) { + auto S00 = _mm256_xor_ps(temp, temp); + auto S01 = _mm256_xor_ps(temp, temp); + auto S02 = _mm256_xor_ps(temp, temp); + auto S03 = _mm256_xor_ps(temp, temp); + + auto S10 = _mm256_xor_ps(temp, temp); + auto S11 = _mm256_xor_ps(temp, temp); + auto S12 = _mm256_xor_ps(temp, temp); + auto S13 = _mm256_xor_ps(temp, temp); + + auto S20 = _mm256_xor_ps(temp, temp); + auto S21 = _mm256_xor_ps(temp, temp); + auto S22 = _mm256_xor_ps(temp, temp); + auto S23 = _mm256_xor_ps(temp, temp); + + auto srcUse = src; + for (int sy = 0; sy < lC8; ++sy) { + auto s0 = LOAD8(srcUse + 0 * 8); + auto s1 = LOAD8(srcUse + 1 * 8); + auto s2 = LOAD8(srcUse + 2 * 8); + temp = LOAD8(B + 0); + S00 = MNNAVXFMA(s0, temp, S00); + S10 = MNNAVXFMA(s1, temp, S10); + S20 = MNNAVXFMA(s2, temp, S20); + temp = LOAD8(B + 8); + S01 = MNNAVXFMA(s0, temp, S01); + S11 = MNNAVXFMA(s1, temp, S11); + S21 = MNNAVXFMA(s2, temp, S21); + temp = LOAD8(B + 16); + S02 = MNNAVXFMA(s0, temp, S02); + S12 = MNNAVXFMA(s1, temp, S12); + S22 = MNNAVXFMA(s2, temp, S22); + temp = LOAD8(B + 24); + S03 = MNNAVXFMA(s0, temp, S03); + S13 = MNNAVXFMA(s1, temp, S13); + S23 = MNNAVXFMA(s2, temp, S23); + + B+=32; + srcUse += aStride * 8; + } + + // Hadd + S00 = _mm256_hadd_ps(S00, S01); + S02 = _mm256_hadd_ps(S02, S03); + S00 = _mm256_hadd_ps(S00, S02); + STORE_4(C + 0, _mm_add_ps(_mm256_extractf128_ps(S00, 0), _mm256_extractf128_ps(S00, 1))); + + S10 = _mm256_hadd_ps(S10, S11); + S12 = _mm256_hadd_ps(S12, S13); + S00 = _mm256_hadd_ps(S10, S12); + STORE_4(C + 4, _mm_add_ps(_mm256_extractf128_ps(S00, 0), _mm256_extractf128_ps(S00, 1))); + + S20 = _mm256_hadd_ps(S20, 
S21); + S22 = _mm256_hadd_ps(S22, S23); + S00 = _mm256_hadd_ps(S20, S22); + STORE_4(C + 8, _mm_add_ps(_mm256_extractf128_ps(S00, 0), _mm256_extractf128_ps(S00, 1))); + + B+=bExtraStride; + C+=cStride; + } +} + + +template +static void _AVX_MNNPackednMatMulRemainCommon(TYPE* C, const TYPE* A, const TYPE* B, size_t eSize, + const size_t* parameter) { + auto aStride = parameter[0] / sizeof(TYPE); + auto h = parameter[2]; + auto l = parameter[1]; + auto cStride = parameter[3] / sizeof(TYPE); + auto bExtraStride = parameter[5] / sizeof(TYPE); + auto lC8 = UP_DIV(l, 8); + auto hC4 = UP_DIV(h, 4); + const int hC4Unit = 4; + auto src = A; + __m256 temp; + if (eSize == 2) { + for (int y = 0; y < hC4; ++y) { + auto S00 = _mm256_xor_ps(temp, temp); + auto S01 = _mm256_xor_ps(temp, temp); + auto S02 = _mm256_xor_ps(temp, temp); + auto S03 = _mm256_xor_ps(temp, temp); + + auto S10 = _mm256_xor_ps(temp, temp); + auto S11 = _mm256_xor_ps(temp, temp); + auto S12 = _mm256_xor_ps(temp, temp); + auto S13 = _mm256_xor_ps(temp, temp); + + auto srcUse = src; + for (int sy = 0; sy < lC8; ++sy) { + auto s0 = LOAD8(srcUse + 0 * 8); + auto s1 = LOAD8(srcUse + 1 * 8); + temp = LOAD8(B + 0); + S00 = MNNAVXFMA(s0, temp, S00); + S10 = MNNAVXFMA(s1, temp, S10); + temp = LOAD8(B + 8); + S01 = MNNAVXFMA(s0, temp, S01); + S11 = MNNAVXFMA(s1, temp, S11); + temp = LOAD8(B + 16); + S02 = MNNAVXFMA(s0, temp, S02); + S12 = MNNAVXFMA(s1, temp, S12); + temp = LOAD8(B + 24); + S03 = MNNAVXFMA(s0, temp, S03); + S13 = MNNAVXFMA(s1, temp, S13); + + B+=32; + srcUse += aStride * 8; + } + + // Hadd + S00 = _mm256_hadd_ps(S00, S01); + S02 = _mm256_hadd_ps(S02, S03); + S00 = _mm256_hadd_ps(S00, S02); + STORE_4(C + 0, _mm_add_ps(_mm256_extractf128_ps(S00, 0), _mm256_extractf128_ps(S00, 1))); + + S10 = _mm256_hadd_ps(S10, S11); + S12 = _mm256_hadd_ps(S12, S13); + S00 = _mm256_hadd_ps(S10, S12); + STORE_4(C + 4, _mm_add_ps(_mm256_extractf128_ps(S00, 0), _mm256_extractf128_ps(S00, 1))); + + B+=bExtraStride; + C+=cStride; + } + } + if (eSize == 1) { + for (int y = 0; y < hC4; ++y) { + auto S00 = _mm256_xor_ps(temp, temp); + auto S01 = _mm256_xor_ps(temp, temp); + auto S02 = _mm256_xor_ps(temp, temp); + auto S03 = _mm256_xor_ps(temp, temp); + + auto srcUse = src; + for (int sy = 0; sy < lC8; ++sy) { + auto s0 = LOAD8(srcUse + 0 * 8); + temp = LOAD8(B + 0); + S00 = MNNAVXFMA(s0, temp, S00); + temp = LOAD8(B + 8); + S01 = MNNAVXFMA(s0, temp, S01); + temp = LOAD8(B + 16); + S02 = MNNAVXFMA(s0, temp, S02); + temp = LOAD8(B + 24); + S03 = MNNAVXFMA(s0, temp, S03); + + B+=32; + srcUse += aStride * 8; + } + + // Hadd + S00 = _mm256_hadd_ps(S00, S01); + S02 = _mm256_hadd_ps(S02, S03); + S00 = _mm256_hadd_ps(S00, S02); + STORE_4(C + 0, _mm_add_ps(_mm256_extractf128_ps(S00, 0), _mm256_extractf128_ps(S00, 1))); + + B+=bExtraStride; + C+=cStride; + } + } +} diff --git a/source/backend/cpu/x86_x64/avx/MNNGemmFloatCommon_4.cpp b/source/backend/cpu/x86_x64/avx/MNNGemmFloatCommon_4.cpp deleted file mode 100644 index 3c7cb656..00000000 --- a/source/backend/cpu/x86_x64/avx/MNNGemmFloatCommon_4.cpp +++ /dev/null @@ -1,237 +0,0 @@ -// -// MNNGemmFloatCommon_4.cpp -// MNN -// -// Created by MNN on 2019/08/25. 
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#include -#include -#include "FunctionSummary.hpp" -#include "backend/cpu/compute/Int8FunctionsOpt.h" - -#ifndef _MM_TRANSPOSE4_PS -#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ - do { \ - __m128 tmp3, tmp2, tmp1, tmp0; \ - tmp0 = _mm_unpacklo_ps((row0), (row1)); \ - tmp2 = _mm_unpacklo_ps((row2), (row3)); \ - tmp1 = _mm_unpackhi_ps((row0), (row1)); \ - tmp3 = _mm_unpackhi_ps((row2), (row3)); \ - (row0) = _mm_movelh_ps(tmp0, tmp2); \ - (row1) = _mm_movehl_ps(tmp2, tmp0); \ - (row2) = _mm_movelh_ps(tmp1, tmp3); \ - (row3) = _mm_movehl_ps(tmp3, tmp1); \ - } while (0) -#endif - -#ifdef MNN_VEC_PRINT -#include -static void _dump(__m256 v0) { - float fv0[8]; - _mm256_store_ps(fv0, v0); - for (int i = 0; i < 8; ++i) { - MNN_PRINT("%f, ", fv0[i]); - } - MNN_PRINT("\n"); -} -#endif -static __m256 _merge(__m256 v0, __m256 v1, __m256 v2, __m256 v3) { - auto h0 = _mm256_hadd_ps(v0, v1); - auto h1 = _mm256_hadd_ps(v2, v3); - auto res = _mm256_hadd_ps(h0, h1); - return res; -} - -static __m128 merge128(__m128 d0, __m128 d1, __m128 d2, __m128 d3) { - auto d00 = _mm_hadd_ps(d0, d1); - auto d01 = _mm_hadd_ps(d2, d3); - return _mm_hadd_ps(d00, d01); -} - -void _AVX_MNNGemmFloatCommon_4(float* dst, const float* src, const float* weight, size_t src_depth_quad, - size_t dst_step, size_t dst_depth_quad, size_t width, size_t weight_depth_offset) { - auto src_depth_step = 4 * width; - const int unit = 4; - int wUnit = width / unit; - auto wUnitEnd = wUnit * unit; - for (int dz = 0; dz < dst_depth_quad; ++dz) { - float* dst_z = dst + dz * dst_step; - auto weight_dz = weight + dz * (src_depth_quad * 16 + weight_depth_offset); - - for (int dx = 0; dx < wUnit; ++dx) { - float* dst_x = dst_z + dx * 4 * unit; - const float* src_dx = src + dx * 4 * unit; - - auto is0 = _mm256_loadu_ps(src_dx + 8 * 0); - auto is1 = _mm256_loadu_ps(src_dx + 8 * 1); - - auto iw0 = _mm256_broadcast_ps((const __m128*)(weight_dz + 4 * 0)); - auto iw1 = _mm256_broadcast_ps((const __m128*)(weight_dz + 4 * 1)); - auto iw2 = _mm256_broadcast_ps((const __m128*)(weight_dz + 4 * 2)); - auto iw3 = _mm256_broadcast_ps((const __m128*)(weight_dz + 4 * 3)); - -#define MNN_INIT_VEC(i, j) auto d##i##j = _mm256_mul_ps(is##i, iw##j) - MNN_INIT_VEC(0, 0); - MNN_INIT_VEC(0, 1); - MNN_INIT_VEC(0, 2); - MNN_INIT_VEC(0, 3); - MNN_INIT_VEC(1, 0); - MNN_INIT_VEC(1, 1); - MNN_INIT_VEC(1, 2); - MNN_INIT_VEC(1, 3); -#undef MNN_INIT_VEC - for (int sz = 1; sz < src_depth_quad; ++sz) { - const float* src_z = src_dx + sz * src_depth_step; - auto s0 = _mm256_loadu_ps(src_z + 8 * 0); - auto s1 = _mm256_loadu_ps(src_z + 8 * 1); - - const float* weight_z = weight_dz + sz * 16; - auto w0 = _mm256_broadcast_ps((const __m128*)(weight_z + 4 * 0)); - auto w1 = _mm256_broadcast_ps((const __m128*)(weight_z + 4 * 1)); - auto w2 = _mm256_broadcast_ps((const __m128*)(weight_z + 4 * 2)); - auto w3 = _mm256_broadcast_ps((const __m128*)(weight_z + 4 * 3)); -#define COMPUTE(i, j) d##i##j = _mm256_add_ps(_mm256_mul_ps(s##i, w##j), d##i##j) - COMPUTE(0, 0); - COMPUTE(0, 1); - COMPUTE(0, 2); - COMPUTE(0, 3); - - COMPUTE(1, 0); - COMPUTE(1, 1); - COMPUTE(1, 2); - COMPUTE(1, 3); - -#undef COMPUTE - } - - _mm256_storeu_ps(dst_x + 8 * 0, _merge(d00, d01, d02, d03)); - _mm256_storeu_ps(dst_x + 8 * 1, _merge(d10, d11, d12, d13)); - } - for (int dx = wUnitEnd; dx < width; ++dx) { - float* dst_x = dst_z + dx * 4; - auto d0 = _mm_set1_ps(0.0f); - auto d1 = _mm_set1_ps(0.0f); - auto d2 = _mm_set1_ps(0.0f); - auto d3 = _mm_set1_ps(0.0f); 
- - const float* src_dx = src + 4 * dx; - for (int sz = 0; sz < src_depth_quad; ++sz) { - const float* src_z = src_dx + sz * src_depth_step; - const float* weight_z = weight_dz + sz * 16; - auto w0 = _mm_loadu_ps(weight_z + 4 * 0); - auto w1 = _mm_loadu_ps(weight_z + 4 * 1); - auto w2 = _mm_loadu_ps(weight_z + 4 * 2); - auto w3 = _mm_loadu_ps(weight_z + 4 * 3); - auto s = _mm_loadu_ps(src_z); -#define COMPUTE(i) d##i = _mm_add_ps(_mm_mul_ps(s, w##i), d##i) - COMPUTE(0); - COMPUTE(1); - COMPUTE(2); - COMPUTE(3); -#undef COMPUTE - } - _mm_storeu_ps(dst_x, merge128(d0, d1, d2, d3)); - } - } -} - -void _AVX_MNNGemmFloatCommonFMA_4(float* dst, const float* src, const float* weight, size_t src_depth_quad, - size_t dst_step, size_t dst_depth_quad, size_t width, size_t weight_depth_offset) { - auto src_depth_step = 4 * width; - const int unit = 4; - int wUnit = width / unit; - auto wUnitEnd = wUnit * unit; - for (int dz = 0; dz < dst_depth_quad; ++dz) { - float* dst_z = dst + dz * dst_step; - auto weight_dz = weight + dz * (src_depth_quad * 16 + weight_depth_offset); - - for (int dx = 0; dx < wUnit; ++dx) { - float* dst_x = dst_z + dx * 4 * unit; - const float* src_dx = src + dx * 4 * unit; - - auto is0 = _mm256_loadu_ps(src_dx + 8 * 0); - auto is1 = _mm256_loadu_ps(src_dx + 8 * 1); - - auto iw0 = _mm256_broadcast_ps((const __m128*)(weight_dz + 4 * 0)); - auto iw1 = _mm256_broadcast_ps((const __m128*)(weight_dz + 4 * 1)); - auto iw2 = _mm256_broadcast_ps((const __m128*)(weight_dz + 4 * 2)); - auto iw3 = _mm256_broadcast_ps((const __m128*)(weight_dz + 4 * 3)); - -#define MNN_INIT_VEC(i, j) auto d##i##j = _mm256_mul_ps(is##i, iw##j) - MNN_INIT_VEC(0, 0); - MNN_INIT_VEC(0, 1); - MNN_INIT_VEC(0, 2); - MNN_INIT_VEC(0, 3); - MNN_INIT_VEC(1, 0); - MNN_INIT_VEC(1, 1); - MNN_INIT_VEC(1, 2); - MNN_INIT_VEC(1, 3); -#undef MNN_INIT_VEC - for (int sz = 1; sz < src_depth_quad; ++sz) { - const float* src_z = src_dx + sz * src_depth_step; - auto s0 = _mm256_loadu_ps(src_z + 8 * 0); - auto s1 = _mm256_loadu_ps(src_z + 8 * 1); - - const float* weight_z = weight_dz + sz * 16; - auto w0 = _mm256_broadcast_ps((const __m128*)(weight_z + 4 * 0)); - auto w1 = _mm256_broadcast_ps((const __m128*)(weight_z + 4 * 1)); - auto w2 = _mm256_broadcast_ps((const __m128*)(weight_z + 4 * 2)); - auto w3 = _mm256_broadcast_ps((const __m128*)(weight_z + 4 * 3)); -#define COMPUTE(i, j) d##i##j = _mm256_fmadd_ps(s##i, w##j, d##i##j) - COMPUTE(0, 0); - COMPUTE(0, 1); - COMPUTE(0, 2); - COMPUTE(0, 3); - - COMPUTE(1, 0); - COMPUTE(1, 1); - COMPUTE(1, 2); - COMPUTE(1, 3); - -#undef COMPUTE - } - - _mm256_storeu_ps(dst_x + 8 * 0, _merge(d00, d01, d02, d03)); - _mm256_storeu_ps(dst_x + 8 * 1, _merge(d10, d11, d12, d13)); - } - for (int dx = wUnitEnd; dx < width; ++dx) { - float* dst_x = dst_z + dx * 4; - auto d0 = _mm_set1_ps(0.0f); - auto d1 = _mm_set1_ps(0.0f); - auto d2 = _mm_set1_ps(0.0f); - auto d3 = _mm_set1_ps(0.0f); - - const float* src_dx = src + 4 * dx; - for (int sz = 0; sz < src_depth_quad; ++sz) { - const float* src_z = src_dx + sz * src_depth_step; - const float* weight_z = weight_dz + sz * 16; - auto w0 = _mm_loadu_ps(weight_z + 4 * 0); - auto w1 = _mm_loadu_ps(weight_z + 4 * 1); - auto w2 = _mm_loadu_ps(weight_z + 4 * 2); - auto w3 = _mm_loadu_ps(weight_z + 4 * 3); - auto s = _mm_loadu_ps(src_z); -#define COMPUTE(i) d##i = _mm_fmadd_ps(s, w##i, d##i) - COMPUTE(0); - COMPUTE(1); - COMPUTE(2); - COMPUTE(3); -#undef COMPUTE - } - _mm_storeu_ps(dst_x, merge128(d0, d1, d2, d3)); - } - } -} - -void _AVX_MNNGemmFloatUnit_4(float* dst, 
const float* src, const float* weight, size_t src_depth_quad, size_t dst_step, - size_t dst_depth_quad, size_t weight_depth_offset) { - return _AVX_MNNGemmFloatCommon_4(dst, src, weight, src_depth_quad, dst_step, dst_depth_quad, 8, - weight_depth_offset); -} - -void _AVX_MNNGemmFloatUnitFMA_4(float* dst, const float* src, const float* weight, size_t src_depth_quad, - size_t dst_step, size_t dst_depth_quad, size_t weight_depth_offset) { - return _AVX_MNNGemmFloatCommonFMA_4(dst, src, weight, src_depth_quad, dst_step, dst_depth_quad, 8, - weight_depth_offset); -} diff --git a/source/backend/cpu/x86_x64/avx512/CommonOptFunction.cpp b/source/backend/cpu/x86_x64/avx512/CommonOptFunction.cpp index e37e517d..2b464f7b 100644 --- a/source/backend/cpu/x86_x64/avx512/CommonOptFunction.cpp +++ b/source/backend/cpu/x86_x64/avx512/CommonOptFunction.cpp @@ -1,323 +1,11 @@ #include "FunctionSummary.hpp" +#include "Gemm24_4_4.hpp" #include "core/Macro.h" #include "math/Vec.hpp" #include #include #include #include -#ifdef MNN_X86_USE_ASM -extern "C" { -void _AVX512_MNNGemmFloatUnitMainFMA(float* C, const float* A, const float* B, const size_t* parameter, size_t hC4); -} -#endif - -using Vec8 = MNN::Math::Vec; -#define MNNAVXFMA _mm256_fmadd_ps -#define MNNAVX512FMA _mm512_fmadd_ps -#define MNNSSEFMA _mm_fmadd_ps - -#define AVX512_TRANSPOSE_SAVE(u, v, z0, z3, z6, z9, z12, z15, z18, z21) \ - { \ - auto m0 = _mm512_extractf32x4_ps(z0, u); \ - auto m1 = _mm512_extractf32x4_ps(z3, u); \ - auto m2 = _mm512_extractf32x4_ps(z6, u); \ - auto m3 = _mm512_extractf32x4_ps(z9, u); \ - auto m4 = _mm512_extractf32x4_ps(z12, u); \ - auto m5 = _mm512_extractf32x4_ps(z15, u); \ - auto m6 = _mm512_extractf32x4_ps(z18, u); \ - auto m7 = _mm512_extractf32x4_ps(z21, u); \ - _MM_TRANSPOSE4_PS(m0, m1, m2, m3); \ - _MM_TRANSPOSE4_PS(m4, m5, m6, m7); \ - _mm_storeu_ps(dst + 4 * (0 + 4 * u + 16 * v), m0); \ - _mm_storeu_ps(dst + 4 * (1 + 4 * u + 16 * v), m1); \ - _mm_storeu_ps(dst + 4 * (2 + 4 * u + 16 * v), m2); \ - _mm_storeu_ps(dst + 4 * (3 + 4 * u + 16 * v), m3); \ - _mm_storeu_ps(dst + cStride + 4 * (0 + 4 * u + 16 * v), m4); \ - _mm_storeu_ps(dst + cStride + 4 * (1 + 4 * u + 16 * v), m5); \ - _mm_storeu_ps(dst + cStride + 4 * (2 + 4 * u + 16 * v), m6); \ - _mm_storeu_ps(dst + cStride + 4 * (3 + 4 * u + 16 * v), m7); \ - } - -#define AVX512_TRANSPOSE_SAVE_HALF(u, v, z0, z3, z6, z9, z12, z15, z18, z21) \ - { \ - auto m0 = _mm512_extractf32x4_ps(z0, u); \ - auto m1 = _mm512_extractf32x4_ps(z3, u); \ - auto m2 = _mm512_extractf32x4_ps(z6, u); \ - auto m3 = _mm512_extractf32x4_ps(z9, u); \ - _MM_TRANSPOSE4_PS(m0, m1, m2, m3); \ - _mm_storeu_ps(dst + 4 * (0 + 4 * u + 16 * v), m0); \ - _mm_storeu_ps(dst + 4 * (1 + 4 * u + 16 * v), m1); \ - _mm_storeu_ps(dst + 4 * (2 + 4 * u + 16 * v), m2); \ - _mm_storeu_ps(dst + 4 * (3 + 4 * u + 16 * v), m3); \ - } - -#define AVX2_TRANSPOSE_SAVE(u, z0, z3, z6, z9, z12, z15, z18, z21) \ - { \ - auto m0 = _mm256_extractf128_ps(z0, u); \ - auto m1 = _mm256_extractf128_ps(z3, u); \ - auto m2 = _mm256_extractf128_ps(z6, u); \ - auto m3 = _mm256_extractf128_ps(z9, u); \ - auto m4 = _mm256_extractf128_ps(z12, u); \ - auto m5 = _mm256_extractf128_ps(z15, u); \ - auto m6 = _mm256_extractf128_ps(z18, u); \ - auto m7 = _mm256_extractf128_ps(z21, u); \ - _MM_TRANSPOSE4_PS(m0, m1, m2, m3); \ - _MM_TRANSPOSE4_PS(m4, m5, m6, m7); \ - _mm_storeu_ps(dst + 4 * (0 + 4 * u), m0); \ - _mm_storeu_ps(dst + 4 * (1 + 4 * u), m1); \ - _mm_storeu_ps(dst + 4 * (2 + 4 * u), m2); \ - _mm_storeu_ps(dst + 4 * (3 + 4 * u), m3); \ - 
_mm_storeu_ps(dst + cStride + 4 * (0 + 4 * u), m4); \ - _mm_storeu_ps(dst + cStride + 4 * (1 + 4 * u), m5); \ - _mm_storeu_ps(dst + cStride + 4 * (2 + 4 * u), m6); \ - _mm_storeu_ps(dst + cStride + 4 * (3 + 4 * u), m7); \ - } - -#define AVX2_TRANSPOSE_SAVE_HALF(u, z0, z3, z6, z9, z12, z15, z18, z21) \ - { \ - auto m0 = _mm256_extractf128_ps(z0, u); \ - auto m1 = _mm256_extractf128_ps(z3, u); \ - auto m2 = _mm256_extractf128_ps(z6, u); \ - auto m3 = _mm256_extractf128_ps(z9, u); \ - _MM_TRANSPOSE4_PS(m0, m1, m2, m3); \ - _mm_storeu_ps(dst + 4 * (0 + 4 * u), m0); \ - _mm_storeu_ps(dst + 4 * (1 + 4 * u), m1); \ - _mm_storeu_ps(dst + 4 * (2 + 4 * u), m2); \ - _mm_storeu_ps(dst + 4 * (3 + 4 * u), m3); \ - } - -#define INIT_MAIN_4_8 \ - auto s0 = _mm256_loadu_ps(weight + 0 * 8); \ - auto w0 = _mm256_broadcast_ss(A + 0 * aStride + 0); \ - auto w1 = _mm256_broadcast_ss(A + 0 * aStride + 1); \ - auto w2 = _mm256_broadcast_ss(A + 0 * aStride + 2); \ - auto w3 = _mm256_broadcast_ss(A + 0 * aStride + 3); \ - auto z0 = _mm256_mul_ps(s0, w0); \ - auto z1 = _mm256_mul_ps(s0, w1); \ - auto z2 = _mm256_mul_ps(s0, w2); \ - auto z3 = _mm256_mul_ps(s0, w3); \ - -#define COMPUTE_4_8 \ - s0 = _mm256_loadu_ps(weight + sy * 8); \ - w0 = _mm256_broadcast_ss(A + sy * aStride + 0); \ - w1 = _mm256_broadcast_ss(A + sy * aStride + 1); \ - w2 = _mm256_broadcast_ss(A + sy * aStride + 2); \ - w3 = _mm256_broadcast_ss(A + sy * aStride + 3); \ - z0 = MNNAVXFMA(s0, w0, z0); \ - z1 = MNNAVXFMA(s0, w1, z1); \ - z2 = MNNAVXFMA(s0, w2, z2); \ - z3 = MNNAVXFMA(s0, w3, z3); \ - -#define INIT_MAIN_5_8 \ - auto s0 = _mm256_loadu_ps(weight + 0 * 8); \ - auto w0 = _mm256_broadcast_ss(A + 0 * aStride + 0); \ - auto w1 = _mm256_broadcast_ss(A + 0 * aStride + 1); \ - auto w2 = _mm256_broadcast_ss(A + 0 * aStride + 2); \ - auto w3 = _mm256_broadcast_ss(A + 0 * aStride + 3); \ - auto w4 = _mm256_broadcast_ss(A + 0 * aStride + 4); \ - auto z0 = _mm256_mul_ps(s0, w0); \ - auto z1 = _mm256_mul_ps(s0, w1); \ - auto z2 = _mm256_mul_ps(s0, w2); \ - auto z3 = _mm256_mul_ps(s0, w3); \ - auto z4 = _mm256_mul_ps(s0, w4); \ - -#define COMPUTE_5_8 \ - s0 = _mm256_loadu_ps(weight + sy * 8); \ - w0 = _mm256_broadcast_ss(A + sy * aStride + 0); \ - w1 = _mm256_broadcast_ss(A + sy * aStride + 1); \ - w2 = _mm256_broadcast_ss(A + sy * aStride + 2); \ - w3 = _mm256_broadcast_ss(A + sy * aStride + 3); \ - w4 = _mm256_broadcast_ss(A + sy * aStride + 4); \ - z0 = MNNAVXFMA(s0, w0, z0); \ - z1 = MNNAVXFMA(s0, w1, z1); \ - z2 = MNNAVXFMA(s0, w2, z2); \ - z3 = MNNAVXFMA(s0, w3, z3); \ - z4 = MNNAVXFMA(s0, w4, z4); \ - - -#define INIT_MAIN_8_8 \ - auto s0 = _mm256_loadu_ps(A + 0 * aStride); \ - auto w0 = _mm256_broadcast_ss(weight + 0 * 8 + 0); \ - auto w1 = _mm256_broadcast_ss(weight + 0 * 8 + 1); \ - auto w2 = _mm256_broadcast_ss(weight + 0 * 8 + 2); \ - auto w3 = _mm256_broadcast_ss(weight + 0 * 8 + 3); \ - auto w4 = _mm256_broadcast_ss(weight + 0 * 8 + 4); \ - auto w5 = _mm256_broadcast_ss(weight + 0 * 8 + 5); \ - auto w6 = _mm256_broadcast_ss(weight + 0 * 8 + 6); \ - auto w7 = _mm256_broadcast_ss(weight + 0 * 8 + 7); \ - auto z0 = _mm256_mul_ps(s0, w0); \ - auto z1 = _mm256_mul_ps(s0, w1); \ - auto z2 = _mm256_mul_ps(s0, w2); \ - auto z3 = _mm256_mul_ps(s0, w3); \ - auto z4 = _mm256_mul_ps(s0, w4); \ - auto z5 = _mm256_mul_ps(s0, w5); \ - auto z6 = _mm256_mul_ps(s0, w6); \ - auto z7 = _mm256_mul_ps(s0, w7); - -#define COMPUTE_8_8 \ - s0 = _mm256_loadu_ps(A + sy * aStride); \ - w0 = _mm256_broadcast_ss(weight + sy * 8 + 0); \ - w1 = _mm256_broadcast_ss(weight 
+ sy * 8 + 1); \ - w2 = _mm256_broadcast_ss(weight + sy * 8 + 2); \ - w3 = _mm256_broadcast_ss(weight + sy * 8 + 3); \ - w4 = _mm256_broadcast_ss(weight + sy * 8 + 4); \ - w5 = _mm256_broadcast_ss(weight + sy * 8 + 5); \ - w6 = _mm256_broadcast_ss(weight + sy * 8 + 6); \ - w7 = _mm256_broadcast_ss(weight + sy * 8 + 7); \ - z0 = MNNAVXFMA(s0, w0, z0); \ - z1 = MNNAVXFMA(s0, w1, z1); \ - z2 = MNNAVXFMA(s0, w2, z2); \ - z3 = MNNAVXFMA(s0, w3, z3); \ - z4 = MNNAVXFMA(s0, w4, z4); \ - z5 = MNNAVXFMA(s0, w5, z5); \ - z6 = MNNAVXFMA(s0, w6, z6); \ - z7 = MNNAVXFMA(s0, w7, z7); - -#define INIT_MAIN_16_8 \ - auto s0 = _mm512_loadu_ps(A + 0 * aStride); \ - auto wt = _mm_load_ss(weight + 0 * 8 + 0); \ - auto w0 = _mm512_broadcastss_ps(wt); \ - auto z0 = _mm512_mul_ps(s0, w0); \ - wt = _mm_load_ss(weight + 0 * 8 + 1); \ - auto w1 = _mm512_broadcastss_ps(wt); \ - auto z1 = _mm512_mul_ps(s0, w1); \ - wt = _mm_load_ss(weight + 0 * 8 + 2); \ - auto w2 = _mm512_broadcastss_ps(wt); \ - auto z2 = _mm512_mul_ps(s0, w2); \ - wt = _mm_load_ss(weight + 0 * 8 + 3); \ - auto w3 = _mm512_broadcastss_ps(wt); \ - auto z3 = _mm512_mul_ps(s0, w3); \ - wt = _mm_load_ss(weight + 0 * 8 + 4); \ - auto w4 = _mm512_broadcastss_ps(wt); \ - auto z4 = _mm512_mul_ps(s0, w4); \ - wt = _mm_load_ss(weight + 0 * 8 + 5); \ - auto w5 = _mm512_broadcastss_ps(wt); \ - auto z5 = _mm512_mul_ps(s0, w5); \ - wt = _mm_load_ss(weight + 0 * 8 + 6); \ - auto w6 = _mm512_broadcastss_ps(wt); \ - auto z6 = _mm512_mul_ps(s0, w6); \ - wt = _mm_load_ss(weight + 0 * 8 + 7); \ - auto w7 = _mm512_broadcastss_ps(wt); \ - auto z7 = _mm512_mul_ps(s0, w7); - - -#define COMPUTE_16_8 \ - s0 = _mm512_loadu_ps(A + sy * aStride); \ - wt = _mm_load_ss(weight + sy * 8 + 0); \ - w0 = _mm512_broadcastss_ps(wt); \ - z0 = MNNAVX512FMA(s0, w0, z0); \ - wt = _mm_load_ss(weight + sy * 8 + 1); \ - w1 = _mm512_broadcastss_ps(wt); \ - z1 = MNNAVX512FMA(s0, w1, z1); \ - wt = _mm_load_ss(weight + sy * 8 + 2); \ - w2 = _mm512_broadcastss_ps(wt); \ - z2 = MNNAVX512FMA(s0, w2, z2); \ - wt = _mm_load_ss(weight + sy * 8 + 3); \ - w3 = _mm512_broadcastss_ps(wt); \ - z3 = MNNAVX512FMA(s0, w3, z3); \ - wt = _mm_load_ss(weight + sy * 8 + 4); \ - w4 = _mm512_broadcastss_ps(wt); \ - z4 = MNNAVX512FMA(s0, w4, z4); \ - wt = _mm_load_ss(weight + sy * 8 + 5); \ - w5 = _mm512_broadcastss_ps(wt); \ - z5 = MNNAVX512FMA(s0, w5, z5); \ - wt = _mm_load_ss(weight + sy * 8 + 6); \ - w6 = _mm512_broadcastss_ps(wt); \ - z6 = MNNAVX512FMA(s0, w6, z6); \ - wt = _mm_load_ss(weight + sy * 8 + 7); \ - w7 = _mm512_broadcastss_ps(wt); \ - z7 = MNNAVX512FMA(s0, w7, z7); - -#define INIT_MAIN_48_8 \ - auto s0 = _mm512_loadu_ps(A + 0 * 48); \ - auto s1 = _mm512_loadu_ps(A + 0 * 48 + 16); \ - auto s2 = _mm512_loadu_ps(A + 0 * 48 + 32); \ - auto wt = _mm_load_ss(weight + 0 * 8 + 0); \ - auto w0 = _mm512_broadcastss_ps(wt); \ - auto z0 = _mm512_mul_ps(s0, w0); \ - auto z1 = _mm512_mul_ps(s1, w0); \ - auto z2 = _mm512_mul_ps(s2, w0); \ - wt = _mm_load_ss(weight + 0 * 8 + 1); \ - auto w1 = _mm512_broadcastss_ps(wt); \ - auto z3 = _mm512_mul_ps(s0, w1); \ - auto z4 = _mm512_mul_ps(s1, w1); \ - auto z5 = _mm512_mul_ps(s2, w1); \ - wt = _mm_load_ss(weight + 0 * 8 + 2); \ - auto w2 = _mm512_broadcastss_ps(wt); \ - auto z6 = _mm512_mul_ps(s0, w2); \ - auto z7 = _mm512_mul_ps(s1, w2); \ - auto z8 = _mm512_mul_ps(s2, w2); \ - wt = _mm_load_ss(weight + 0 * 8 + 3); \ - auto w3 = _mm512_broadcastss_ps(wt); \ - auto z9 = _mm512_mul_ps(s0, w3); \ - auto z10 = _mm512_mul_ps(s1, w3); \ - auto z11 = _mm512_mul_ps(s2, w3); \ - wt = 
_mm_load_ss(weight + 0 * 8 + 4); \ - auto w4 = _mm512_broadcastss_ps(wt); \ - auto z12 = _mm512_mul_ps(s0, w4); \ - auto z13 = _mm512_mul_ps(s1, w4); \ - auto z14 = _mm512_mul_ps(s2, w4); \ - wt = _mm_load_ss(weight + 0 * 8 + 5); \ - auto w5 = _mm512_broadcastss_ps(wt); \ - auto z15 = _mm512_mul_ps(s0, w5); \ - auto z16 = _mm512_mul_ps(s1, w5); \ - auto z17 = _mm512_mul_ps(s2, w5); \ - wt = _mm_load_ss(weight + 0 * 8 + 6); \ - auto w6 = _mm512_broadcastss_ps(wt); \ - auto z18 = _mm512_mul_ps(s0, w6); \ - auto z19 = _mm512_mul_ps(s1, w6); \ - auto z20 = _mm512_mul_ps(s2, w6); \ - wt = _mm_load_ss(weight + 0 * 8 + 7); \ - auto w7 = _mm512_broadcastss_ps(wt); \ - auto z21 = _mm512_mul_ps(s0, w7); \ - auto z22 = _mm512_mul_ps(s1, w7); \ - auto z23 = _mm512_mul_ps(s2, w7); - -#define COMPUTE_48_8 \ - s0 = _mm512_loadu_ps(A + sy * 48); \ - s1 = _mm512_loadu_ps(A + sy * 48 + 16); \ - s2 = _mm512_loadu_ps(A + sy * 48 + 32); \ - wt = _mm_load_ss(weight + sy * 8 + 0); \ - w0 = _mm512_broadcastss_ps(wt); \ - z0 = MNNAVX512FMA(s0, w0, z0); \ - z1 = MNNAVX512FMA(s1, w0, z1); \ - z2 = MNNAVX512FMA(s2, w0, z2); \ - wt = _mm_load_ss(weight + sy * 8 + 1); \ - w1 = _mm512_broadcastss_ps(wt); \ - z3 = MNNAVX512FMA(s0, w1, z3); \ - z4 = MNNAVX512FMA(s1, w1, z4); \ - z5 = MNNAVX512FMA(s2, w1, z5); \ - wt = _mm_load_ss(weight + sy * 8 + 2); \ - w2 = _mm512_broadcastss_ps(wt); \ - z6 = MNNAVX512FMA(s0, w2, z6); \ - z7 = MNNAVX512FMA(s1, w2, z7); \ - z8 = MNNAVX512FMA(s2, w2, z8); \ - wt = _mm_load_ss(weight + sy * 8 + 3); \ - w3 = _mm512_broadcastss_ps(wt); \ - z9 = MNNAVX512FMA(s0, w3, z9); \ - z10 = MNNAVX512FMA(s1, w3, z10); \ - z11 = MNNAVX512FMA(s2, w3, z11); \ - wt = _mm_load_ss(weight + sy * 8 + 4); \ - w4 = _mm512_broadcastss_ps(wt); \ - z12 = MNNAVX512FMA(s0, w4, z12); \ - z13 = MNNAVX512FMA(s1, w4, z13); \ - z14 = MNNAVX512FMA(s2, w4, z14); \ - wt = _mm_load_ss(weight + sy * 8 + 5); \ - w5 = _mm512_broadcastss_ps(wt); \ - z15 = MNNAVX512FMA(s0, w5, z15); \ - z16 = MNNAVX512FMA(s1, w5, z16); \ - z17 = MNNAVX512FMA(s2, w5, z17); \ - wt = _mm_load_ss(weight + sy * 8 + 6); \ - w6 = _mm512_broadcastss_ps(wt); \ - z18 = MNNAVX512FMA(s0, w6, z18); \ - z19 = MNNAVX512FMA(s1, w6, z19); \ - z20 = MNNAVX512FMA(s2, w6, z20); \ - wt = _mm_load_ss(weight + sy * 8 + 7); \ - w7 = _mm512_broadcastss_ps(wt); \ - z21 = MNNAVX512FMA(s0, w7, z21); \ - z22 = MNNAVX512FMA(s1, w7, z22); \ - z23 = MNNAVX512FMA(s2, w7, z23); - // TODO: this function is not implemented for avx512 yet. 
void AVX512GemmPostTreat(float* C, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias) { @@ -399,651 +87,319 @@ void AVX512GemmPostTreat(float* C, size_t eSize, const size_t* parameter, const } } -static void _AVX512_MNNPackedMatMul_48(float* C, const float* A, const float* B, const size_t* parameter) { - auto h = parameter[2]; - auto l = parameter[1]; - auto cStride = parameter[3] / sizeof(float); - auto bExtraStride = parameter[5] / sizeof(float); - auto bStride = bExtraStride + l * 8; //hP=8 - auto hC4 = UP_DIV(h, 4); - auto hC8 = hC4 / 2; - auto hR = hC4 % 2; +#ifdef MNN_X86_USE_ASM +extern "C" { +void _AVX512_MNNGemmFloatUnitMainFMA(float* C, const float* A, const float* B, const size_t* parameter, size_t hC4); +void _AVX512_MNNGemmFloatUnit16(float* C, const float* A, const float* B, const size_t* parameter, size_t hC4); +} +#endif - for (int y = 0; y < hC8; ++y) { - auto weight = B + y * bStride; - auto dst = C + 2 * y * cStride; - INIT_MAIN_48_8; - for (int sy = 1; sy < l; ++sy) { - COMPUTE_48_8; +void _AVX512_MNNPackC4ForMatMul_A(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el) { + int number = info[0]; + int eReal = info[1]; + int eDest = info[2]; + int offset = info[3]; + int pOffset = 4 * offset; + + for (int n=0; n 0) { - auto weight = B + hC8 * bStride; - auto dst = C + 2 * hC8 * cStride; - INIT_MAIN_48_8; - - for (int sy = 1; sy < l; ++sy) { - COMPUTE_48_8; + for (int x=0; x 0) { + float temp[16]; + ::memset(temp, 0, sizeof(float) * 16); + ::memcpy(temp + 4 * 0, srcY0, lR * sizeof(float)); + ::memcpy(temp + 4 * 1, srcY1, lR * sizeof(float)); + ::memcpy(temp + 4 * 2, srcY2, lR * sizeof(float)); + ::memcpy(temp + 4 * 3, srcY3, lR * sizeof(float)); + ::memcpy(dstY, temp, sizeof(float) * 16); + } } - AVX512_TRANSPOSE_SAVE(0, 0, z0, z1, z2, z3, z4, z5, z6, z7); - AVX512_TRANSPOSE_SAVE(1, 0, z0, z1, z2, z3, z4, z5, z6, z7); - AVX512_TRANSPOSE_SAVE(2, 0, z0, z1, z2, z3, z4, z5, z6, z7); - AVX512_TRANSPOSE_SAVE(3, 0, z0, z1, z2, z3, z4, z5, z6, z7); - } - if (hR > 0) { - auto weight = B + hC8 * bStride; - auto dst = C + 2 * hC8 * cStride; - INIT_MAIN_16_8; - - for (int sy = 1; sy < l; ++sy) { - COMPUTE_16_8; + if (hR > 0) { + auto srcY0 = source + (4 * h4 + 0) * l; + auto srcY1 = source + (4 * h4 + 1) * l; + auto srcY2 = source + (4 * h4 + 2) * l; + auto dstY = dest + 16 * h4 * lC4; + auto zero = _mm_set1_ps(0.0f); + switch (hR) { + case 3: { + for (int x = 0; x < l4; ++x) { + _mm_storeu_ps(dstY + 4 * 0, _mm_loadu_ps(srcY0)); + _mm_storeu_ps(dstY + 4 * 1, _mm_loadu_ps(srcY1)); + _mm_storeu_ps(dstY + 4 * 2, _mm_loadu_ps(srcY2)); + _mm_storeu_ps(dstY + 4 * 3, zero); + srcY0 += 4; + srcY1 += 4; + srcY2 += 4; + dstY += 16; + } + break; + } + case 2: { + for (int x = 0; x < l4; ++x) { + _mm_storeu_ps(dstY + 4 * 0, _mm_loadu_ps(srcY0)); + _mm_storeu_ps(dstY + 4 * 1, _mm_loadu_ps(srcY1)); + _mm_storeu_ps(dstY + 4 * 2, zero); + _mm_storeu_ps(dstY + 4 * 3, zero); + srcY0 += 4; + srcY1 += 4; + dstY += 16; + } + break; + } + case 1: { + for (int x = 0; x < l4; ++x) { + _mm_storeu_ps(dstY + 4 * 0, _mm_loadu_ps(srcY0)); + _mm_storeu_ps(dstY + 4 * 1, zero); + _mm_storeu_ps(dstY + 4 * 2, zero); + _mm_storeu_ps(dstY + 4 * 3, zero); + srcY0 += 4; + dstY += 16; + } + break; + } + default: + break; + } + if (lR > 0) { + float temp[16]; + ::memset(temp, 0, sizeof(float) * 16); + ::memcpy(temp + 4 * 0, srcY0, lR * sizeof(float)); + if (hR >= 1) { + ::memcpy(temp + 4 * 1, srcY1, lR * sizeof(float)); + } + if (hR >= 2) { + 
::memcpy(temp + 4 * 2, srcY2, lR * sizeof(float)); + } + ::memcpy(dstY, temp, sizeof(float) * 16); + } } - AVX512_TRANSPOSE_SAVE_HALF(0, 0, z0, z1, z2, z3, z4, z5, z6, z7); - AVX512_TRANSPOSE_SAVE_HALF(1, 0, z0, z1, z2, z3, z4, z5, z6, z7); - AVX512_TRANSPOSE_SAVE_HALF(2, 0, z0, z1, z2, z3, z4, z5, z6, z7); - AVX512_TRANSPOSE_SAVE_HALF(3, 0, z0, z1, z2, z3, z4, z5, z6, z7); - } -} - -static void _AVX2_MNNPackedMatMul_8(float* C, const float* A, const float* B, const size_t* parameter) { - auto aStride = parameter[0] / sizeof(float); - auto h = parameter[2]; - auto l = parameter[1]; - auto cStride = parameter[3] / sizeof(float); - auto bExtraStride = parameter[5] / sizeof(float); - auto bStride = bExtraStride + l * 8; - auto hC4 = UP_DIV(h, 4); - auto hC8 = hC4 / 2; - auto hR = hC4 % 2; - - for (int y = 0; y < hC8; ++y) { - auto weight = B + y * bStride; - auto dst = C + 2 * y * cStride; - INIT_MAIN_8_8; - - for (int sy = 1; sy < l; ++sy) { - COMPUTE_8_8; - } - AVX2_TRANSPOSE_SAVE(0, z0, z1, z2, z3, z4, z5, z6, z7); - AVX2_TRANSPOSE_SAVE(1, z0, z1, z2, z3, z4, z5, z6, z7); - } - if (hR > 0) { - auto weight = B + hC8 * bStride; - auto dst = C + 2 * hC8 * cStride; - INIT_MAIN_8_8; - - for (int sy = 1; sy < l; ++sy) { - COMPUTE_8_8; - } - AVX2_TRANSPOSE_SAVE_HALF(0, z0, z1, z2, z3, z4, z5, z6, z7); - AVX2_TRANSPOSE_SAVE_HALF(1, z0, z1, z2, z3, z4, z5, z6, z7); - } -} - -static void _AVX2_MNNPackedMatMul_5(float* C, const float* A, const float* B, const size_t* parameter) { - auto aStride = parameter[0] / sizeof(float); - auto h = parameter[2]; - auto l = parameter[1]; - auto cStride = parameter[3] / sizeof(float); - auto bExtraStride = parameter[5] / sizeof(float); - auto bStride = bExtraStride + l * 8; - auto hC4 = UP_DIV(h, 4); - auto hC8 = hC4 / 2; - auto hR = hC4 % 2; - auto lC2 = l / 2; - auto lR = l % 2; - - for (int y = 0; y < hC8; ++y) { - auto Z0 = _mm512_setzero_ps(); - auto Z1 = _mm512_setzero_ps(); - auto Z2 = _mm512_setzero_ps(); - auto Z3 = _mm512_setzero_ps(); - auto Z4 = _mm512_setzero_ps(); - auto a = A; - for (int sy = 0; sy < lC2; ++sy) { - auto W = _mm512_loadu_ps(B + 16 * sy); - auto s00 = _mm256_broadcast_ss(a); - auto s01 = _mm256_broadcast_ss(a + aStride); - auto S0 = _mm512_insertf32x8(_mm512_castps256_ps512(s00), s01, 1); - - auto s10 = _mm256_broadcast_ss(a + 1); - auto s11 = _mm256_broadcast_ss(a + 1 + aStride); - auto S1 = _mm512_insertf32x8(_mm512_castps256_ps512(s10), s11, 1); - - auto s20 = _mm256_broadcast_ss(a + 2); - auto s21 = _mm256_broadcast_ss(a + 2 + aStride); - auto S2 = _mm512_insertf32x8(_mm512_castps256_ps512(s20), s21, 1); - - auto s30 = _mm256_broadcast_ss(a + 3); - auto s31 = _mm256_broadcast_ss(a + 3 + aStride); - auto S3 = _mm512_insertf32x8(_mm512_castps256_ps512(s30), s31, 1); - - auto s40 = _mm256_broadcast_ss(a + 4); - auto s41 = _mm256_broadcast_ss(a + 4 + aStride); - auto S4 = _mm512_insertf32x8(_mm512_castps256_ps512(s40), s41, 1); - - Z0 = MNNAVX512FMA(S0, W, Z0); - Z1 = MNNAVX512FMA(S1, W, Z1); - Z2 = MNNAVX512FMA(S2, W, Z2); - Z3 = MNNAVX512FMA(S3, W, Z3); - Z4 = MNNAVX512FMA(S4, W, Z4); - - a += 2 * aStride; - } - auto z0 = _mm256_add_ps(_mm512_extractf32x8_ps(Z0, 0), _mm512_extractf32x8_ps(Z0, 1)); - auto z1 = _mm256_add_ps(_mm512_extractf32x8_ps(Z1, 0), _mm512_extractf32x8_ps(Z1, 1)); - auto z2 = _mm256_add_ps(_mm512_extractf32x8_ps(Z2, 0), _mm512_extractf32x8_ps(Z2, 1)); - auto z3 = _mm256_add_ps(_mm512_extractf32x8_ps(Z3, 0), _mm512_extractf32x8_ps(Z3, 1)); - auto z4 = _mm256_add_ps(_mm512_extractf32x8_ps(Z4, 0), 
_mm512_extractf32x8_ps(Z4, 1)); - if (lR > 0) { - int sy = l - 1; - __m256 s0; - __m256 w0; - __m256 w1; - __m256 w2; - __m256 w3; - __m256 w4; - auto weight = B; - COMPUTE_5_8; - } - auto p0 = _mm256_permute2f128_ps(z0, z1, 32); - auto p2 = _mm256_permute2f128_ps(z0, z1, 49); - auto p1 = _mm256_permute2f128_ps(z2, z3, 32); - auto p3 = _mm256_permute2f128_ps(z2, z3, 49); - auto p4 = _mm256_extractf128_ps(z4, 0); - auto p5 = _mm256_extractf128_ps(z4, 1); - _mm256_storeu_ps(C + 8 * 0, p0); - _mm256_storeu_ps(C + 8 * 1, p1); - _mm_storeu_ps(C + 8 * 2, p4); - _mm256_storeu_ps(C + 8 * 0 + cStride, p2); - _mm256_storeu_ps(C + 8 * 1 + cStride, p3); - _mm_storeu_ps(C + 8 * 2 + cStride, p5); - - B += bStride; - C += 2 * cStride; - } - if (hR > 0) { - auto weight = B; - auto dst = C; - INIT_MAIN_5_8; - - for (int sy = 1; sy < l; ++sy) { - COMPUTE_5_8; - } - auto p0 = _mm256_permute2f128_ps(z0, z1, 32); - auto p2 = _mm256_permute2f128_ps(z0, z1, 49); - auto p1 = _mm256_permute2f128_ps(z2, z3, 32); - auto p3 = _mm256_permute2f128_ps(z2, z3, 49); - auto p4 = _mm256_extractf128_ps(z4, 0); - _mm256_storeu_ps(dst + 8 * 0, p0); - _mm256_storeu_ps(dst + 8 * 1, p1); - _mm_storeu_ps(dst + 8 * 2, p4); - } -} - -static void _AVX2_MNNPackedMatMul_4(float* C, const float* A, const float* B, const size_t* parameter) { - auto aStride = parameter[0] / sizeof(float); - auto h = parameter[2]; - auto l = parameter[1]; - auto cStride = parameter[3] / sizeof(float); - auto bExtraStride = parameter[5] / sizeof(float); - auto bStride = bExtraStride + l * 8; - auto hC4 = UP_DIV(h, 4); - auto hC8 = hC4 / 2; - auto hR = hC4 % 2; - - for (int y = 0; y < hC8; ++y) { - auto weight = B + y * bStride; - auto dst = C + 2 * y * cStride; - INIT_MAIN_4_8; - for (int sy = 1; sy < l; ++sy) { - COMPUTE_4_8; - } - auto p0 = _mm256_permute2f128_ps(z0, z1, 32); - auto p2 = _mm256_permute2f128_ps(z0, z1, 49); - auto p1 = _mm256_permute2f128_ps(z2, z3, 32); - auto p3 = _mm256_permute2f128_ps(z2, z3, 49); - _mm256_storeu_ps(dst + 8 * 0, p0); - _mm256_storeu_ps(dst + 8 * 1, p1); - _mm256_storeu_ps(dst + cStride + 8 * 0, p2); - _mm256_storeu_ps(dst + cStride + 8 * 1, p3); - } - if (hR > 0) { - auto weight = B + hC8 * bStride; - auto dst = C + 2 * hC8 * cStride; - INIT_MAIN_4_8; - - for (int sy = 1; sy < l; ++sy) { - COMPUTE_4_8; - } - auto p0 = _mm256_permute2f128_ps(z0, z1, 32); - auto p2 = _mm256_permute2f128_ps(z0, z1, 49); - auto p1 = _mm256_permute2f128_ps(z2, z3, 32); - auto p3 = _mm256_permute2f128_ps(z2, z3, 49); - _mm256_storeu_ps(dst + 8 * 0, p0); - _mm256_storeu_ps(dst + 8 * 1, p1); - } -} - -static void _AVX512_MNNPackednMatMulRemainCommon(float* C, const float* A, const float* B, size_t eSize, - const size_t* parameter, float* cache, const float* postParameters, - const float* bias) { - auto h = parameter[2]; - auto l = parameter[1]; - auto cStride = parameter[3] / sizeof(float); - auto bExtraStride = parameter[5] / sizeof(float); - auto bStride = bExtraStride + l * 8; - auto hC4 = UP_DIV(h, 4); - auto hC8 = hC4 / 2; - auto hR = hC4 % 2; - auto es = eSize; - auto oC = C; - auto aStride = parameter[0] / sizeof(float); - - while (eSize >= 16) { - _AVX512_MNNPackedMatMul_16(C, A, B, parameter); - eSize -= 16; - C += 16 * 4; - A += 16; - } - - while (eSize >= 8) { - _AVX2_MNNPackedMatMul_8(C, A, B, parameter); - eSize -= 8; - C += 8 * 4; - A += 8; - } - if (eSize >= 5) { - _AVX2_MNNPackedMatMul_5(C, A, B, parameter); - eSize -= 5; - C += 5 * 4; - A += 5; - } - if (eSize >= 4) { - _AVX2_MNNPackedMatMul_4(C, A, B, parameter); - eSize -= 
4; - C += 4 * 4; - A += 4; - } - - if (eSize == 0) { return; } - - int valid = 1 << 31; - __m128i mask; - switch (eSize) { - case 1: - mask = _mm_set_epi32(0, 0, 0, valid); - break; - case 2: - mask = _mm_set_epi32(0, 0, valid, valid); - break; - case 3: - mask = _mm_set_epi32(0, valid, valid, valid); - break; - } - - //TODO: further optimize - // Remain x = 1..3 - for (int y = 0; y < hC8; y++) { - auto weight = B + y * bStride; - auto dst = C + 2 * y * cStride; - //INIT_MAIN_x_8 - auto s0 = _mm_maskload_ps(A + 0 * aStride, mask); - auto w0 = _mm_broadcast_ss(weight + 0 * 8 + 0); - auto w1 = _mm_broadcast_ss(weight + 0 * 8 + 1); - auto w2 = _mm_broadcast_ss(weight + 0 * 8 + 2); - auto w3 = _mm_broadcast_ss(weight + 0 * 8 + 3); - auto w4 = _mm_broadcast_ss(weight + 0 * 8 + 4); - auto w5 = _mm_broadcast_ss(weight + 0 * 8 + 5); - auto w6 = _mm_broadcast_ss(weight + 0 * 8 + 6); - auto w7 = _mm_broadcast_ss(weight + 0 * 8 + 7); - auto z0 = _mm_mul_ps(s0, w0); - auto z1 = _mm_mul_ps(s0, w1); - auto z2 = _mm_mul_ps(s0, w2); - auto z3 = _mm_mul_ps(s0, w3); - auto z4 = _mm_mul_ps(s0, w4); - auto z5 = _mm_mul_ps(s0, w5); - auto z6 = _mm_mul_ps(s0, w6); - auto z7 = _mm_mul_ps(s0, w7); - //COMPUTE_x_8 - for (int sy = 1; sy < l; sy++) { - s0 = _mm_maskload_ps(A + sy * aStride, mask); - w0 = _mm_broadcast_ss(weight + sy * 8 + 0); - w1 = _mm_broadcast_ss(weight + sy * 8 + 1); - w2 = _mm_broadcast_ss(weight + sy * 8 + 2); - w3 = _mm_broadcast_ss(weight + sy * 8 + 3); - w4 = _mm_broadcast_ss(weight + sy * 8 + 4); - w5 = _mm_broadcast_ss(weight + sy * 8 + 5); - w6 = _mm_broadcast_ss(weight + sy * 8 + 6); - w7 = _mm_broadcast_ss(weight + sy * 8 + 7); - z0 = MNNSSEFMA(s0, w0, z0); - z1 = MNNSSEFMA(s0, w1, z1); - z2 = MNNSSEFMA(s0, w2, z2); - z3 = MNNSSEFMA(s0, w3, z3); - z4 = MNNSSEFMA(s0, w4, z4); - z5 = MNNSSEFMA(s0, w5, z5); - z6 = MNNSSEFMA(s0, w6, z6); - z7 = MNNSSEFMA(s0, w7, z7); + + + // No Transpose + for (int x = 0; x < l4; ++x) { + auto srcX0 = source + (4 * x + 0) * h; + auto srcX1 = source + (4 * x + 1) * h; + auto srcX2 = source + (4 * x + 2) * h; + auto srcX3 = source + (4 * x + 3) * h; + auto dstX = dest + 16 * x; + for (int y = 0; y < h4; ++y) { + auto p0 = _mm_loadu_ps(srcX0); + auto p1 = _mm_loadu_ps(srcX1); + auto p2 = _mm_loadu_ps(srcX2); + auto p3 = _mm_loadu_ps(srcX3); + _MM_TRANSPOSE4_PS(p0, p1, p2, p3); + _mm_storeu_ps(dstX + 4 * 0, p0); + _mm_storeu_ps(dstX + 4 * 1, p1); + _mm_storeu_ps(dstX + 4 * 2, p2); + _mm_storeu_ps(dstX + 4 * 3, p3); + srcX0 += 4; + srcX1 += 4; + srcX2 += 4; + srcX3 += 4; + dstX += 16 * lC4; } - //TRANSPOSE_SAVE - _MM_TRANSPOSE4_PS(z0, z1, z2, z3); - _MM_TRANSPOSE4_PS(z4, z5, z6, z7); - if (eSize == 1) { - _mm_storeu_ps(dst + 4 * 0, z0); - _mm_storeu_ps(dst + cStride + 4 * 0, z4); - } else if (eSize == 2) { - _mm_storeu_ps(dst + 4 * 0, z0); - _mm_storeu_ps(dst + 4 * 1, z1); - _mm_storeu_ps(dst + cStride + 4 * 0, z4); - _mm_storeu_ps(dst + cStride + 4 * 1, z5); - } else { - _mm_storeu_ps(dst + 4 * 0, z0); - _mm_storeu_ps(dst + 4 * 1, z1); - _mm_storeu_ps(dst + 4 * 2, z2); - _mm_storeu_ps(dst + cStride + 4 * 0, z4); - _mm_storeu_ps(dst + cStride + 4 * 1, z5); - _mm_storeu_ps(dst + cStride + 4 * 2, z6); + if (hR > 0) { + float temp[16]; + ::memset(temp, 0, sizeof(float) * 16); + ::memcpy(temp + 4 * 0, srcX0, hR * sizeof(float)); + ::memcpy(temp + 4 * 1, srcX1, hR * sizeof(float)); + ::memcpy(temp + 4 * 2, srcX2, hR * sizeof(float)); + ::memcpy(temp + 4 * 3, srcX3, hR * sizeof(float)); + auto p0 = _mm_loadu_ps(temp + 4 * 0); + auto p1 = _mm_loadu_ps(temp + 4 * 1); 
+ auto p2 = _mm_loadu_ps(temp + 4 * 2); + auto p3 = _mm_loadu_ps(temp + 4 * 3); + _MM_TRANSPOSE4_PS(p0, p1, p2, p3); + _mm_storeu_ps(dstX + 4 * 0, p0); + _mm_storeu_ps(dstX + 4 * 1, p1); + _mm_storeu_ps(dstX + 4 * 2, p2); + _mm_storeu_ps(dstX + 4 * 3, p3); } } - if (hR > 0) { - auto weight = B + hC8 * bStride; - auto dst = C + 2 * hC8 * cStride; - auto s0 = _mm_maskload_ps(A + 0 * aStride, mask); - auto w0 = _mm_broadcast_ss(weight + 0 * 8 + 0); - auto w1 = _mm_broadcast_ss(weight + 0 * 8 + 1); - auto w2 = _mm_broadcast_ss(weight + 0 * 8 + 2); - auto w3 = _mm_broadcast_ss(weight + 0 * 8 + 3); - auto z0 = _mm_mul_ps(s0, w0); - auto z1 = _mm_mul_ps(s0, w1); - auto z2 = _mm_mul_ps(s0, w2); - auto z3 = _mm_mul_ps(s0, w3); - //COMPUTE_x_8 - for (int sy = 1; sy < l; sy++) { - s0 = _mm_maskload_ps(A + sy * aStride, mask); - w0 = _mm_broadcast_ss(weight + sy * 8 + 0); - w1 = _mm_broadcast_ss(weight + sy * 8 + 1); - w2 = _mm_broadcast_ss(weight + sy * 8 + 2); - w3 = _mm_broadcast_ss(weight + sy * 8 + 3); - z0 = MNNSSEFMA(s0, w0, z0); - z1 = MNNSSEFMA(s0, w1, z1); - z2 = MNNSSEFMA(s0, w2, z2); - z3 = MNNSSEFMA(s0, w3, z3); + if (lR > 0) { + auto zero = _mm_set1_ps(0.0f); + auto srcX0 = source + (4 * l4 + 0) * h; + auto srcX1 = source + (4 * l4 + 1) * h; + auto srcX2 = source + (4 * l4 + 2) * h; + auto dstX = dest + 16 * l4; + switch (lR) { + case 3: { + for (int y = 0; y < h4; ++y) { + auto p0 = _mm_loadu_ps(srcX0); + auto p1 = _mm_loadu_ps(srcX1); + auto p2 = _mm_loadu_ps(srcX2); + auto p3 = zero; + _MM_TRANSPOSE4_PS(p0, p1, p2, p3); + _mm_storeu_ps(dstX + 4 * 0, p0); + _mm_storeu_ps(dstX + 4 * 1, p1); + _mm_storeu_ps(dstX + 4 * 2, p2); + _mm_storeu_ps(dstX + 4 * 3, p3); + srcX0 += 4; + srcX1 += 4; + srcX2 += 4; + dstX += 16 * lC4; + } + break; + } + case 2: { + for (int y = 0; y < h4; ++y) { + auto p0 = _mm_loadu_ps(srcX0); + auto p1 = _mm_loadu_ps(srcX1); + auto p2 = zero; + auto p3 = zero; + _MM_TRANSPOSE4_PS(p0, p1, p2, p3); + _mm_storeu_ps(dstX + 4 * 0, p0); + _mm_storeu_ps(dstX + 4 * 1, p1); + _mm_storeu_ps(dstX + 4 * 2, p2); + _mm_storeu_ps(dstX + 4 * 3, p3); + srcX0 += 4; + srcX1 += 4; + srcX2 += 4; + dstX += 16 * lC4; + } + break; + } + case 1: { + for (int y = 0; y < h4; ++y) { + auto p0 = _mm_loadu_ps(srcX0); + auto p1 = zero; + auto p2 = zero; + auto p3 = zero; + _MM_TRANSPOSE4_PS(p0, p1, p2, p3); + _mm_storeu_ps(dstX + 4 * 0, p0); + _mm_storeu_ps(dstX + 4 * 1, p1); + _mm_storeu_ps(dstX + 4 * 2, p2); + _mm_storeu_ps(dstX + 4 * 3, p3); + srcX0 += 4; + srcX1 += 4; + srcX2 += 4; + dstX += 16 * lC4; + } + break; + } + default: + break; } - //TRANSPOSE_SAVE - _MM_TRANSPOSE4_PS(z0, z1, z2, z3); - if (eSize == 1) { - _mm_storeu_ps(dst + 4 * 0, z0); - } else if (eSize == 2) { - _mm_storeu_ps(dst + 4 * 0, z0); - _mm_storeu_ps(dst + 4 * 1, z1); - } else { - _mm_storeu_ps(dst + 4 * 0, z0); - _mm_storeu_ps(dst + 4 * 1, z1); - _mm_storeu_ps(dst + 4 * 2, z2); + if (hR > 0) { + float temp[16]; + ::memset(temp, 0, sizeof(float) * 16); + ::memcpy(temp + 4 * 0, srcX0, hR * sizeof(float)); + if (lR > 1) { + ::memcpy(temp + 4 * 1, srcX1, hR * sizeof(float)); + } + if (lR > 2) { + ::memcpy(temp + 4 * 2, srcX2, hR * sizeof(float)); + } + auto p0 = _mm_loadu_ps(temp + 4 * 0); + auto p1 = _mm_loadu_ps(temp + 4 * 1); + auto p2 = _mm_loadu_ps(temp + 4 * 2); + auto p3 = _mm_loadu_ps(temp + 4 * 3); + _MM_TRANSPOSE4_PS(p0, p1, p2, p3); + _mm_storeu_ps(dstX + 4 * 0, p0); + _mm_storeu_ps(dstX + 4 * 1, p1); + _mm_storeu_ps(dstX + 4 * 2, p2); + _mm_storeu_ps(dstX + 4 * 3, p3); } } } -void 
_AVX512_MNNPackC4ForMatMul_A(float* dest, const float* source, size_t e, size_t l, size_t eReal) { -#define MAIN_COMPUTE \ - auto s00 = _mm_loadu_ps(srcX + 0 * 4); \ - auto s01 = _mm_loadu_ps(srcX + 1 * 4); \ - auto s02 = _mm_loadu_ps(srcX + 2 * 4); \ - auto s03 = _mm_loadu_ps(srcX + 3 * 4); \ - auto s10 = _mm_loadu_ps(srcX + 4 * 4); \ - auto s11 = _mm_loadu_ps(srcX + 5 * 4); \ - auto s12 = _mm_loadu_ps(srcX + 6 * 4); \ - auto s13 = _mm_loadu_ps(srcX + 7 * 4); \ - auto s20 = _mm_loadu_ps(srcX + 8 * 4); \ - auto s21 = _mm_loadu_ps(srcX + 9 * 4); \ - auto s22 = _mm_loadu_ps(srcX + 10 * 4); \ - auto s23 = _mm_loadu_ps(srcX + 11 * 4); \ - auto s30 = _mm_loadu_ps(srcX + 12 * 4); \ - auto s31 = _mm_loadu_ps(srcX + 13 * 4); \ - auto s32 = _mm_loadu_ps(srcX + 14 * 4); \ - auto s33 = _mm_loadu_ps(srcX + 15 * 4); \ - auto s40 = _mm_loadu_ps(srcX + 16 * 4); \ - auto s41 = _mm_loadu_ps(srcX + 17 * 4); \ - auto s42 = _mm_loadu_ps(srcX + 18 * 4); \ - auto s43 = _mm_loadu_ps(srcX + 19 * 4); \ - auto s50 = _mm_loadu_ps(srcX + 20 * 4); \ - auto s51 = _mm_loadu_ps(srcX + 21 * 4); \ - auto s52 = _mm_loadu_ps(srcX + 22 * 4); \ - auto s53 = _mm_loadu_ps(srcX + 23 * 4); \ - auto s60 = _mm_loadu_ps(srcX + 24 * 4); \ - auto s61 = _mm_loadu_ps(srcX + 25 * 4); \ - auto s62 = _mm_loadu_ps(srcX + 26 * 4); \ - auto s63 = _mm_loadu_ps(srcX + 27 * 4); \ - auto s70 = _mm_loadu_ps(srcX + 28 * 4); \ - auto s71 = _mm_loadu_ps(srcX + 29 * 4); \ - auto s72 = _mm_loadu_ps(srcX + 30 * 4); \ - auto s73 = _mm_loadu_ps(srcX + 31 * 4); \ - auto s80 = _mm_loadu_ps(srcX + 32 * 4); \ - auto s81 = _mm_loadu_ps(srcX + 33 * 4); \ - auto s82 = _mm_loadu_ps(srcX + 34 * 4); \ - auto s83 = _mm_loadu_ps(srcX + 35 * 4); \ - auto s90 = _mm_loadu_ps(srcX + 36 * 4); \ - auto s91 = _mm_loadu_ps(srcX + 37 * 4); \ - auto s92 = _mm_loadu_ps(srcX + 38 * 4); \ - auto s93 = _mm_loadu_ps(srcX + 39 * 4); \ - auto s100 = _mm_loadu_ps(srcX + 40 * 4); \ - auto s101 = _mm_loadu_ps(srcX + 41 * 4); \ - auto s102 = _mm_loadu_ps(srcX + 42 * 4); \ - auto s103 = _mm_loadu_ps(srcX + 43 * 4); \ - auto s110 = _mm_loadu_ps(srcX + 44 * 4); \ - auto s111 = _mm_loadu_ps(srcX + 45 * 4); \ - auto s112 = _mm_loadu_ps(srcX + 46 * 4); \ - auto s113 = _mm_loadu_ps(srcX + 47 * 4); \ - _MM_TRANSPOSE4_PS(s00, s01, s02, s03); \ - _MM_TRANSPOSE4_PS(s10, s11, s12, s13); \ - _MM_TRANSPOSE4_PS(s20, s21, s22, s23); \ - _MM_TRANSPOSE4_PS(s30, s31, s32, s33); \ - _MM_TRANSPOSE4_PS(s40, s41, s42, s43); \ - _MM_TRANSPOSE4_PS(s50, s51, s52, s53); \ - _MM_TRANSPOSE4_PS(s60, s61, s62, s63); \ - _MM_TRANSPOSE4_PS(s70, s71, s72, s73); \ - _MM_TRANSPOSE4_PS(s80, s81, s82, s83); \ - _MM_TRANSPOSE4_PS(s90, s91, s92, s93); \ - _MM_TRANSPOSE4_PS(s100, s101, s102, s103); \ - _MM_TRANSPOSE4_PS(s110, s111, s112, s113); - -#define STORE_TEMP(i) \ - _mm_storeu_ps(dstX + 4 * (12 * i + 0), s##0##i); \ - _mm_storeu_ps(dstX + 4 * (12 * i + 1), s##1##i); \ - _mm_storeu_ps(dstX + 4 * (12 * i + 2), s##2##i); \ - _mm_storeu_ps(dstX + 4 * (12 * i + 3), s##3##i); \ - _mm_storeu_ps(dstX + 4 * (12 * i + 4), s##4##i); \ - _mm_storeu_ps(dstX + 4 * (12 * i + 5), s##5##i); \ - _mm_storeu_ps(dstX + 4 * (12 * i + 6), s##6##i); \ - _mm_storeu_ps(dstX + 4 * (12 * i + 7), s##7##i); \ - _mm_storeu_ps(dstX + 4 * (12 * i + 8), s##8##i); \ - _mm_storeu_ps(dstX + 4 * (12 * i + 9), s##9##i); \ - _mm_storeu_ps(dstX + 4 * (12 * i + 10), s##10##i); \ - _mm_storeu_ps(dstX + 4 * (12 * i + 11), s##11##i); - - const int pack = 48; //eP=48 Hardcode here? 
- const int packC4 = pack / 4; - auto ePack = e / pack; - auto lC4 = l / 4; - auto lDiv = UP_DIV(l, 4); - auto eRemain = ePack * pack; - auto lRemain = lC4 * 4; - auto lRes = l - lRemain; - for (int y = 0; y < ePack; ++y) { - auto dstY = dest + y * l * pack; - auto srcY = source + y * pack * 4; - for (int x = 0; x < lC4; ++x) { - auto srcX = srcY + x * 4 * eReal; - auto dstX = dstY + x * pack * 4; - MAIN_COMPUTE; - - STORE_TEMP(0); - STORE_TEMP(1); - STORE_TEMP(2); - STORE_TEMP(3); - } - } - auto lastLc4Src = source + lC4 * 4 * eReal; - auto lastLc4Dst = dest + lC4 * pack * 4; - if (lRes == 3) { - for (int y = 0; y < ePack; ++y) { - auto dstX = lastLc4Dst + y * l * pack; - auto srcX = lastLc4Src + y * pack * 4; - MAIN_COMPUTE; - STORE_TEMP(0); - STORE_TEMP(1); - STORE_TEMP(2); - } - } else if (lRes == 2) { - for (int y = 0; y < ePack; ++y) { - auto dstX = lastLc4Dst + y * l * pack; - auto srcX = lastLc4Src + y * pack * 4; - MAIN_COMPUTE; - STORE_TEMP(0); - STORE_TEMP(1); - } - } else if (lRes == 1) { - for (int y = 0; y < ePack; ++y) { - auto dstX = lastLc4Dst + y * l * pack; - auto srcX = lastLc4Src + y * pack * 4; - MAIN_COMPUTE; - STORE_TEMP(0); - } - } - // Down - { - auto eLast = e - eRemain; - auto lastDest = dest + ePack * pack * l; - for (int xC = 0; xC < lC4; ++xC) { - for (int y = eRemain; y < e; ++y) { - auto yR = y - eRemain; - for (int xR = 0; xR < 4; ++xR) { - lastDest[(xC * 4 + xR) * eLast + yR] = source[xC * eReal * 4 + y * 4 + xR]; - } - } - } - for (int x = lC4 * 4; x < l; ++x) { - auto xR = x % 4; - auto xC = lC4; - for (int y = eRemain; y < e; ++y) { - auto yR = y - eRemain; - lastDest[x * eLast + yR] = source[xC * eReal * 4 + y * 4 + xR]; - } - } - } -} - -void _AVX512_MNNPackForMatMul_B(float* dest, const float* source, size_t h, size_t l, bool transpose) { - { - auto hP = h / 8; - auto hR = hP * 8; - if (hR != h) { - ::memset(dest, 0, UP_DIV(h, 8) * 8 * l * sizeof(float)); - } - if (!transpose) { - for (int y=0; y 0) { - auto destY = dest + hP * 8 * l; - auto sourceY = source + hP * 8; - for (int x=0; x= 20) { + _AVX512_MNNPackedMatMul_20_4(C, A, B, parameter); + eSize -= 20; + C += 20 * 4; + A += 20 * 4; + } + if (eSize >= 16) { + _AVX512_MNNPackedMatMul_16_4(C, A, B, parameter); + eSize -= 16; + C += 16 * 4; + A += 16 * 4; + } + if (eSize >= 12) { + _AVX512_MNNPackedMatMul_12_4(C, A, B, parameter); + eSize -= 12; + C += 12 * 4; + A += 12 * 4; + } + if (eSize >= 8) { + _AVX512_MNNPackedMatMul_8_4(C, A, B, parameter); + eSize -= 8; + C += 8 * 4; + A += 8 * 4; + } + if (eSize >= 5) { + _AVX512_MNNPackedMatMul_5_4(C, A, B, parameter); + eSize -= 5; + C += 5 * 4; + A += 5 * 4; + } + if (eSize >= 4) { + _AVX512_MNNPackedMatMul_4_4(C, A, B, parameter); + eSize -= 4; + C += 4 * 4; + A += 4 * 4; + } + if (eSize >= 3) { + _AVX512_MNNPackedMatMul_3_4(C, A, B, parameter); + eSize -= 3; + C += 3 * 4; + A += 3 * 4; + } + if (eSize >= 2) { + _AVX512_MNNPackedMatMul_2_4(C, A, B, parameter); + eSize -= 2; + C += 2 * 4; + A += 2 * 4; + } + if (eSize >= 1) { + _AVX512_MNNPackedMatMul_1_4(C, A, B, parameter); + eSize -= 1; + return; + } +} diff --git a/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit16.S b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit16.S new file mode 100644 index 00000000..0a092b80 --- /dev/null +++ b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit16.S @@ -0,0 +1,197 @@ +// +// _AVX512_MNNGemmFloatUnit16.S +// MNN +// +// Created by MNN on 2020/12/07. 
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "../MNNAsmGlobal.h" +.text +.align 4 + +asm_function _AVX512_MNNGemmFloatUnit16 +//void _AVX512_MNNGemmFloatUnit16(float* C, const float* A, const float* B, const size_t* parameter, size_t hC4) + +// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter, r8: hC4 +// Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter +pushq %rbp +movq %rsp, %rbp + +#ifdef WIN32 +movq 48(%rsp), %r10 +pushq %rdi +pushq %rsi +pushq %r12 +pushq %r13 +movq %rcx, %rdi +movq %rdx, %rsi +movq %r8, %rdx +movq %r9, %rcx +movq %r10, %r9 +#else +pushq %r12 +pushq %r13 +movq %r8, %r9 +#endif + +movq (%rcx), %r12 // aExtraStride +movq 40(%rcx), %r10 // bExtraStride +movq 24(%rcx), %r8 // cStride +movq 8(%rcx), %rcx // l + +cmpq $0, %r9 +je End + +// zmm8-zmm31: Dst +// zmm0-zmm3: Src +// zmm4-zmm7: W + +addq $3, %rcx +shrq $2, %rcx // l -> lC4 +movq %rsi, %r13 + +shlq $2, %r12 // aStride * 4 + +LoopDz: + movq %rcx, %r11 + movq %r13, %rsi + + subq $1, %r11 + + vbroadcastf32x4 (%rdx), %zmm4 + vbroadcastf32x4 16(%rdx), %zmm5 + vbroadcastf32x4 32(%rdx), %zmm6 + vbroadcastf32x4 48(%rdx), %zmm7 + + vmovups (%rsi), %zmm0 + vmovups 64(%rsi), %zmm1 + vmovups 128(%rsi), %zmm2 + vmovups 192(%rsi), %zmm3 + + vmulps %zmm0, %zmm4, %zmm8 + vmulps %zmm0, %zmm5, %zmm9 + vmulps %zmm0, %zmm6, %zmm10 + vmulps %zmm0, %zmm7, %zmm11 + + vmulps %zmm1, %zmm4, %zmm12 + vmulps %zmm1, %zmm5, %zmm13 + vmulps %zmm1, %zmm6, %zmm14 + vmulps %zmm1, %zmm7, %zmm15 + + vmulps %zmm2, %zmm4, %zmm16 + vmulps %zmm2, %zmm5, %zmm17 + vmulps %zmm2, %zmm6, %zmm18 + vmulps %zmm2, %zmm7, %zmm19 + + vmulps %zmm3, %zmm4, %zmm20 + vmulps %zmm3, %zmm5, %zmm21 + vmulps %zmm3, %zmm6, %zmm22 + vmulps %zmm3, %zmm7, %zmm23 + + addq $64, %rdx + addq %r12, %rsi + + cmpq $0, %r11 + je LoopSzEnd + + LoopSz: + vbroadcastf32x4 (%rdx), %zmm4 + vbroadcastf32x4 16(%rdx), %zmm5 + vbroadcastf32x4 32(%rdx), %zmm6 + vbroadcastf32x4 48(%rdx), %zmm7 + + vmovups (%rsi), %zmm0 + vmovups 64(%rsi), %zmm1 + vmovups 128(%rsi), %zmm2 + vmovups 192(%rsi), %zmm3 + + vfmadd231ps %zmm0, %zmm4, %zmm8 + vfmadd231ps %zmm0, %zmm5, %zmm9 + vfmadd231ps %zmm0, %zmm6, %zmm10 + vfmadd231ps %zmm0, %zmm7, %zmm11 + + vfmadd231ps %zmm1, %zmm4, %zmm12 + vfmadd231ps %zmm1, %zmm5, %zmm13 + vfmadd231ps %zmm1, %zmm6, %zmm14 + vfmadd231ps %zmm1, %zmm7, %zmm15 + + vfmadd231ps %zmm2, %zmm4, %zmm16 + vfmadd231ps %zmm2, %zmm5, %zmm17 + vfmadd231ps %zmm2, %zmm6, %zmm18 + vfmadd231ps %zmm2, %zmm7, %zmm19 + + vfmadd231ps %zmm3, %zmm4, %zmm20 + vfmadd231ps %zmm3, %zmm5, %zmm21 + vfmadd231ps %zmm3, %zmm6, %zmm22 + vfmadd231ps %zmm3, %zmm7, %zmm23 + + addq $64, %rdx + addq %r12, %rsi + + subq $1, %r11 + cmpq $0, %r11 + + jne LoopSz + LoopSzEnd: + +.macro HADD_SAVE x0, x1, x2, x3 + vextractf64x4 $0, \x0, %ymm0 + vextractf64x4 $1, \x0, %ymm1 + + vextractf64x4 $0, \x1, %ymm2 + vextractf64x4 $1, \x1, %ymm3 + + vextractf64x4 $0, \x2, %ymm4 + vextractf64x4 $1, \x2, %ymm5 + + vextractf64x4 $0, \x3, %ymm6 + vextractf64x4 $1, \x3, %ymm7 + + vhaddps %ymm2, %ymm0, %ymm0 + vhaddps %ymm6, %ymm4, %ymm4 + vhaddps %ymm3, %ymm1, %ymm1 + vhaddps %ymm7, %ymm5, %ymm5 + + vhaddps %ymm4, %ymm0, %ymm0 + vhaddps %ymm5, %ymm1, %ymm1 + + vmovups %ymm0, (%r11) + vmovups %ymm1, 32(%r11) +.endm + movq %rdi, %r11 + + HADD_SAVE %zmm8, %zmm9, %zmm10, %zmm11 + + addq $64, %r11 + HADD_SAVE %zmm12, %zmm13, %zmm14, %zmm15 + + addq $64, %r11 + HADD_SAVE %zmm16, %zmm17, %zmm18, %zmm19 + + addq $64, %r11 + HADD_SAVE %zmm20, %zmm21, %zmm22, %zmm23 + + addq %r10, %rdx + addq %r8, %rdi + subq $1, %r9 + 
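In the kernel above, each zmm accumulator holds four partial dot products, one per 128-bit lane (four l values multiplied but not yet summed); HADD_SAVE extracts the 256-bit halves of four accumulators and finishes those sums with vhaddps, interleaving sixteen results for two contiguous ymm stores. A hedged intrinsics sketch of the per-lane reduction for a single accumulator (function name is hypothetical):

#include <immintrin.h>
// Reduce each 128-bit lane of one AVX-512 accumulator to a single float,
// i.e. finish the four dot products it holds. Sketch only; the assembly
// above fuses this step across four accumulators at once.
static inline __m128 laneSums(__m512 acc) {
    __m128 l0 = _mm512_extractf32x4_ps(acc, 0);
    __m128 l1 = _mm512_extractf32x4_ps(acc, 1);
    __m128 l2 = _mm512_extractf32x4_ps(acc, 2);
    __m128 l3 = _mm512_extractf32x4_ps(acc, 3);
    __m128 s01 = _mm_hadd_ps(l0, l1);  // pairwise sums of lanes 0 and 1
    __m128 s23 = _mm_hadd_ps(l2, l3);  // pairwise sums of lanes 2 and 3
    return _mm_hadd_ps(s01, s23);      // {sum(l0), sum(l1), sum(l2), sum(l3)}
}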
testq %r9, %r9 + jne LoopDz + + +End: + +#ifdef WIN32 +popq %r13 +popq %r12 +popq %rsi +popq %rdi +popq %rbp +#else +popq %r13 +popq %r12 +popq %rbp +#endif + +retq + diff --git a/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnitMainFMA.S b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnitMainFMA.S index b820909d..502a9369 100644 --- a/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnitMainFMA.S +++ b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnitMainFMA.S @@ -39,360 +39,176 @@ movq 40(%rcx), %r10 // bExtraStride movq 24(%rcx), %r8 // cStride movq 8(%rcx), %rcx // l +cmpq $0, %r9 +je End + // zmm8-zmm31: Dst // zmm0-zmm3: Src // zmm4-zmm7: W +addq $3, %rcx +shrq $2, %rcx // l -> lC4 movq %rsi, %r13 -cmpq $2, %r9 -jl LD1 -LoopDz2: +LoopDz: movq %rcx, %r11 movq %r13, %rsi subq $1, %r11 + vbroadcastf32x4 (%rdx), %zmm4 + vbroadcastf32x4 16(%rdx), %zmm5 + vbroadcastf32x4 32(%rdx), %zmm6 + vbroadcastf32x4 48(%rdx), %zmm7 + vmovups (%rsi), %zmm0 vmovups 64(%rsi), %zmm1 vmovups 128(%rsi), %zmm2 + vmovups 192(%rsi), %zmm3 - vbroadcastss (%rdx), %zmm4 vmulps %zmm0, %zmm4, %zmm8 - vmulps %zmm1, %zmm4, %zmm9 - vmulps %zmm2, %zmm4, %zmm10 + vmulps %zmm0, %zmm5, %zmm9 + vmulps %zmm0, %zmm6, %zmm10 + vmulps %zmm0, %zmm7, %zmm11 - vbroadcastss 4(%rdx), %zmm5 - vmulps %zmm0, %zmm5, %zmm11 - vmulps %zmm1, %zmm5, %zmm12 - vmulps %zmm2, %zmm5, %zmm13 + vmulps %zmm1, %zmm4, %zmm12 + vmulps %zmm1, %zmm5, %zmm13 + vmulps %zmm1, %zmm6, %zmm14 + vmulps %zmm1, %zmm7, %zmm15 - vbroadcastss 8(%rdx), %zmm6 - vmulps %zmm0, %zmm6, %zmm14 - vmulps %zmm1, %zmm6, %zmm15 - vmulps %zmm2, %zmm6, %zmm16 - - vbroadcastss 12(%rdx), %zmm7 - vmulps %zmm0, %zmm7, %zmm17 - vmulps %zmm1, %zmm7, %zmm18 + vmulps %zmm2, %zmm4, %zmm16 + vmulps %zmm2, %zmm5, %zmm17 + vmulps %zmm2, %zmm6, %zmm18 vmulps %zmm2, %zmm7, %zmm19 - vbroadcastss 16(%rdx), %zmm4 - vmulps %zmm0, %zmm4, %zmm20 - vmulps %zmm1, %zmm4, %zmm21 - vmulps %zmm2, %zmm4, %zmm22 + vmovups 256(%rsi), %zmm0 - vbroadcastss 20(%rdx), %zmm5 - vmulps %zmm0, %zmm5, %zmm23 - vmulps %zmm1, %zmm5, %zmm24 - vmulps %zmm2, %zmm5, %zmm25 + vmulps %zmm3, %zmm4, %zmm20 + vmulps %zmm3, %zmm5, %zmm21 + vmulps %zmm3, %zmm6, %zmm22 + vmulps %zmm3, %zmm7, %zmm23 - vbroadcastss 24(%rdx), %zmm6 + vmovups 320(%rsi), %zmm1 + + vmulps %zmm0, %zmm4, %zmm24 + vmulps %zmm0, %zmm5, %zmm25 vmulps %zmm0, %zmm6, %zmm26 - vmulps %zmm1, %zmm6, %zmm27 - vmulps %zmm2, %zmm6, %zmm28 + vmulps %zmm0, %zmm7, %zmm27 - vbroadcastss 28(%rdx), %zmm7 - vmulps %zmm0, %zmm7, %zmm29 - vmulps %zmm1, %zmm7, %zmm30 - vmulps %zmm2, %zmm7, %zmm31 + vmulps %zmm1, %zmm4, %zmm28 + vmulps %zmm1, %zmm5, %zmm29 + vmulps %zmm1, %zmm6, %zmm30 + vmulps %zmm1, %zmm7, %zmm31 - addq $32, %rdx - addq $192, %rsi + addq $64, %rdx + addq $384, %rsi + + cmpq $0, %r11 + je LoopSzEnd - cmpq $2, %r11 - jl LastS1 - LoopSz: + vbroadcastf32x4 (%rdx), %zmm4 + vbroadcastf32x4 16(%rdx), %zmm5 + vbroadcastf32x4 32(%rdx), %zmm6 + vbroadcastf32x4 48(%rdx), %zmm7 + vmovups (%rsi), %zmm0 vmovups 64(%rsi), %zmm1 vmovups 128(%rsi), %zmm2 + vmovups 192(%rsi), %zmm3 - vbroadcastss (%rdx), %zmm4 vfmadd231ps %zmm0, %zmm4, %zmm8 - vfmadd231ps %zmm1, %zmm4, %zmm9 - vfmadd231ps %zmm2, %zmm4, %zmm10 + vfmadd231ps %zmm0, %zmm5, %zmm9 + vfmadd231ps %zmm0, %zmm6, %zmm10 + vfmadd231ps %zmm0, %zmm7, %zmm11 - vbroadcastss 4(%rdx), %zmm5 - vfmadd231ps %zmm0, %zmm5, %zmm11 - vfmadd231ps %zmm1, %zmm5, %zmm12 - vfmadd231ps %zmm2, %zmm5, %zmm13 + vfmadd231ps %zmm1, %zmm4, %zmm12 + vfmadd231ps %zmm1, %zmm5, %zmm13 + vfmadd231ps %zmm1, %zmm6, 
%zmm14 + vfmadd231ps %zmm1, %zmm7, %zmm15 - vbroadcastss 8(%rdx), %zmm6 - vfmadd231ps %zmm0, %zmm6, %zmm14 - vfmadd231ps %zmm1, %zmm6, %zmm15 - vfmadd231ps %zmm2, %zmm6, %zmm16 - - vbroadcastss 12(%rdx), %zmm7 - vfmadd231ps %zmm0, %zmm7, %zmm17 - vfmadd231ps %zmm1, %zmm7, %zmm18 + vfmadd231ps %zmm2, %zmm4, %zmm16 + vfmadd231ps %zmm2, %zmm5, %zmm17 + vfmadd231ps %zmm2, %zmm6, %zmm18 vfmadd231ps %zmm2, %zmm7, %zmm19 - vbroadcastss 16(%rdx), %zmm4 - vfmadd231ps %zmm0, %zmm4, %zmm20 - vfmadd231ps %zmm1, %zmm4, %zmm21 - vfmadd231ps %zmm2, %zmm4, %zmm22 + vmovups 256(%rsi), %zmm0 - vbroadcastss 20(%rdx), %zmm5 - vfmadd231ps %zmm0, %zmm5, %zmm23 - vfmadd231ps %zmm1, %zmm5, %zmm24 - vfmadd231ps %zmm2, %zmm5, %zmm25 + vfmadd231ps %zmm3, %zmm4, %zmm20 + vfmadd231ps %zmm3, %zmm5, %zmm21 + vfmadd231ps %zmm3, %zmm6, %zmm22 + vfmadd231ps %zmm3, %zmm7, %zmm23 - vbroadcastss 24(%rdx), %zmm6 + vmovups 320(%rsi), %zmm1 + + vfmadd231ps %zmm0, %zmm4, %zmm24 + vfmadd231ps %zmm0, %zmm5, %zmm25 vfmadd231ps %zmm0, %zmm6, %zmm26 - vfmadd231ps %zmm1, %zmm6, %zmm27 - vfmadd231ps %zmm2, %zmm6, %zmm28 + vfmadd231ps %zmm0, %zmm7, %zmm27 - vbroadcastss 28(%rdx), %zmm7 - vfmadd231ps %zmm0, %zmm7, %zmm29 - vfmadd231ps %zmm1, %zmm7, %zmm30 - vfmadd231ps %zmm2, %zmm7, %zmm31 - - vmovups 192(%rsi), %zmm0 - vmovups 256(%rsi), %zmm1 - vmovups 320(%rsi), %zmm2 - - vbroadcastss 32(%rdx), %zmm4 - vfmadd231ps %zmm0, %zmm4, %zmm8 - vfmadd231ps %zmm1, %zmm4, %zmm9 - vfmadd231ps %zmm2, %zmm4, %zmm10 - - vbroadcastss 36(%rdx), %zmm5 - vfmadd231ps %zmm0, %zmm5, %zmm11 - vfmadd231ps %zmm1, %zmm5, %zmm12 - vfmadd231ps %zmm2, %zmm5, %zmm13 - - vbroadcastss 40(%rdx), %zmm6 - vfmadd231ps %zmm0, %zmm6, %zmm14 - vfmadd231ps %zmm1, %zmm6, %zmm15 - vfmadd231ps %zmm2, %zmm6, %zmm16 - - vbroadcastss 44(%rdx), %zmm7 - vfmadd231ps %zmm0, %zmm7, %zmm17 - vfmadd231ps %zmm1, %zmm7, %zmm18 - vfmadd231ps %zmm2, %zmm7, %zmm19 - - vbroadcastss 48(%rdx), %zmm4 - vfmadd231ps %zmm0, %zmm4, %zmm20 - vfmadd231ps %zmm1, %zmm4, %zmm21 - vfmadd231ps %zmm2, %zmm4, %zmm22 - - vbroadcastss 52(%rdx), %zmm5 - vfmadd231ps %zmm0, %zmm5, %zmm23 - vfmadd231ps %zmm1, %zmm5, %zmm24 - vfmadd231ps %zmm2, %zmm5, %zmm25 - - vbroadcastss 56(%rdx), %zmm6 - vfmadd231ps %zmm0, %zmm6, %zmm26 - vfmadd231ps %zmm1, %zmm6, %zmm27 - vfmadd231ps %zmm2, %zmm6, %zmm28 - - vbroadcastss 60(%rdx), %zmm7 - vfmadd231ps %zmm0, %zmm7, %zmm29 - vfmadd231ps %zmm1, %zmm7, %zmm30 - vfmadd231ps %zmm2, %zmm7, %zmm31 + vfmadd231ps %zmm1, %zmm4, %zmm28 + vfmadd231ps %zmm1, %zmm5, %zmm29 + vfmadd231ps %zmm1, %zmm6, %zmm30 + vfmadd231ps %zmm1, %zmm7, %zmm31 addq $64, %rdx addq $384, %rsi - subq $2, %r11 - cmpq $2, %r11 - jge LoopSz - LastS1: - cmpq $1, %r11 - jl Last - vmovups (%rsi), %zmm0 - vmovups 64(%rsi), %zmm1 - vmovups 128(%rsi), %zmm2 + subq $1, %r11 + cmpq $0, %r11 - vbroadcastss (%rdx), %zmm4 - vbroadcastss 4(%rdx), %zmm5 - vbroadcastss 8(%rdx), %zmm6 - vbroadcastss 12(%rdx), %zmm7 + jne LoopSz + LoopSzEnd: - vfmadd231ps %zmm0, %zmm4, %zmm8 - vfmadd231ps %zmm1, %zmm4, %zmm9 - vfmadd231ps %zmm2, %zmm4, %zmm10 +.macro HADD_SAVE x0, x1, x2, x3 + vextractf64x4 $0, \x0, %ymm0 + vextractf64x4 $1, \x0, %ymm1 - vfmadd231ps %zmm0, %zmm5, %zmm11 - vfmadd231ps %zmm1, %zmm5, %zmm12 - vfmadd231ps %zmm2, %zmm5, %zmm13 + vextractf64x4 $0, \x1, %ymm2 + vextractf64x4 $1, \x1, %ymm3 - vfmadd231ps %zmm0, %zmm6, %zmm14 - vfmadd231ps %zmm1, %zmm6, %zmm15 - vfmadd231ps %zmm2, %zmm6, %zmm16 + vextractf64x4 $0, \x2, %ymm4 + vextractf64x4 $1, \x2, %ymm5 - vfmadd231ps %zmm0, %zmm7, %zmm17 - vfmadd231ps %zmm1, %zmm7, %zmm18 
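The rewritten main loop in this FMA kernel follows the same group-broadcast scheme, and its register budget can be read off the loads and strides (a rough accounting, inferred from the new instructions; the epilogue referenced below follows further on):

// Per LoopSz iteration (inferred sketch):
//   B: 4 output channels x 4 l values = 16 floats  -> rdx advances by 64 bytes
//   A: 6 zmm loads x 16 floats = 24 e x 4 l        -> rsi advances by 384 bytes
//   Accumulators: 6 A registers x 4 B channels = 24 (zmm8..zmm31)
//   Epilogue: 6 x HADD_SAVE, each storing 16 floats = 24 e x 4 h results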
- vfmadd231ps %zmm2, %zmm7, %zmm19 + vextractf64x4 $0, \x3, %ymm6 + vextractf64x4 $1, \x3, %ymm7 - vbroadcastss 16(%rdx), %zmm4 - vbroadcastss 20(%rdx), %zmm5 - vbroadcastss 24(%rdx), %zmm6 - vbroadcastss 28(%rdx), %zmm7 + vhaddps %ymm2, %ymm0, %ymm0 + vhaddps %ymm6, %ymm4, %ymm4 + vhaddps %ymm3, %ymm1, %ymm1 + vhaddps %ymm7, %ymm5, %ymm5 - vfmadd231ps %zmm0, %zmm4, %zmm20 - vfmadd231ps %zmm1, %zmm4, %zmm21 - vfmadd231ps %zmm2, %zmm4, %zmm22 - - vfmadd231ps %zmm0, %zmm5, %zmm23 - vfmadd231ps %zmm1, %zmm5, %zmm24 - vfmadd231ps %zmm2, %zmm5, %zmm25 - - vfmadd231ps %zmm0, %zmm6, %zmm26 - vfmadd231ps %zmm1, %zmm6, %zmm27 - vfmadd231ps %zmm2, %zmm6, %zmm28 - - vfmadd231ps %zmm0, %zmm7, %zmm29 - vfmadd231ps %zmm1, %zmm7, %zmm30 - vfmadd231ps %zmm2, %zmm7, %zmm31 - - addq $32, %rdx - - Last: - -.macro TRANSPOSE_SAVE x0, x1, x2, x3 - vpunpckldq \x1, \x0, %zmm0 - vpunpckldq \x3, \x2, %zmm2 - vpunpckhdq \x1, \x0, %zmm1 - vpunpckhdq \x3, \x2, %zmm3 - - vpunpcklqdq %zmm2, %zmm0, \x0 - vpunpckhqdq %zmm2, %zmm0, \x1 - vpunpcklqdq %zmm3, %zmm1, \x2 - vpunpckhqdq %zmm3, %zmm1, \x3 - - vextractf32x8 $0, \x0, %ymm0 - vextractf32x8 $0, \x1, %ymm1 - vperm2f128 $32, %ymm1, %ymm0, %ymm4 - vperm2f128 $49, %ymm1, %ymm0, %ymm5 - vextractf32x8 $0, \x2, %ymm2 - vextractf32x8 $0, \x3, %ymm3 - vmovups %ymm4, (%r11) - vmovups %ymm5, 64(%r11) - vperm2f128 $32, %ymm3, %ymm2, %ymm6 - vperm2f128 $49, %ymm3, %ymm2, %ymm7 - vmovups %ymm6, 32(%r11) - vmovups %ymm7, 96(%r11) - - vextractf32x8 $1, \x0, %ymm0 - vextractf32x8 $1, \x1, %ymm1 - vperm2f128 $32, %ymm1, %ymm0, %ymm4 - vperm2f128 $49, %ymm1, %ymm0, %ymm5 - vextractf32x8 $1, \x2, %ymm2 - vextractf32x8 $1, \x3, %ymm3 - vmovups %ymm4, 128(%r11) - vmovups %ymm5, 192(%r11) - vperm2f128 $32, %ymm3, %ymm2, %ymm6 - vperm2f128 $49, %ymm3, %ymm2, %ymm7 - vmovups %ymm6, 160(%r11) - vmovups %ymm7, 224(%r11) + vhaddps %ymm4, %ymm0, %ymm0 + vhaddps %ymm5, %ymm1, %ymm1 + vmovups %ymm0, (%r11) + vmovups %ymm1, 32(%r11) .endm movq %rdi, %r11 - TRANSPOSE_SAVE %zmm8, %zmm11, %zmm14, %zmm17 - addq $256, %r11 - TRANSPOSE_SAVE %zmm9, %zmm12, %zmm15, %zmm18 - addq $256, %r11 - TRANSPOSE_SAVE %zmm10, %zmm13, %zmm16, %zmm19 - addq %r8, %rdi + HADD_SAVE %zmm8, %zmm9, %zmm10, %zmm11 - movq %rdi, %r11 - TRANSPOSE_SAVE %zmm20, %zmm23, %zmm26, %zmm29 - addq $256, %r11 - TRANSPOSE_SAVE %zmm21, %zmm24, %zmm27, %zmm30 - addq $256, %r11 - TRANSPOSE_SAVE %zmm22, %zmm25, %zmm28, %zmm31 + addq $64, %r11 + HADD_SAVE %zmm12, %zmm13, %zmm14, %zmm15 - addq %r8, %rdi + addq $64, %r11 + HADD_SAVE %zmm16, %zmm17, %zmm18, %zmm19 + + addq $64, %r11 + HADD_SAVE %zmm20, %zmm21, %zmm22, %zmm23 + + addq $64, %r11 + HADD_SAVE %zmm24, %zmm25, %zmm26, %zmm27 + + addq $64, %r11 + HADD_SAVE %zmm28, %zmm29, %zmm30, %zmm31 addq %r10, %rdx + addq %r8, %rdi + subq $1, %r9 + testq %r9, %r9 + jne LoopDz - subq $2, %r9 - cmpq $2, %r9 - jge LoopDz2 - -LD1: -cmpq $1, %r9 -jl End - -movq %rcx, %r11 -movq %r13, %rsi - -subq $1, %r11 - -vmovups (%rsi), %zmm0 -vmovups 64(%rsi), %zmm1 -vmovups 128(%rsi), %zmm2 - -vbroadcastss (%rdx), %zmm4 -vbroadcastss 4(%rdx), %zmm5 -vbroadcastss 8(%rdx), %zmm6 -vbroadcastss 12(%rdx), %zmm7 - -vmulps %zmm0, %zmm4, %zmm8 -vmulps %zmm1, %zmm4, %zmm9 -vmulps %zmm2, %zmm4, %zmm10 - -vmulps %zmm0, %zmm5, %zmm11 -vmulps %zmm1, %zmm5, %zmm12 -vmulps %zmm2, %zmm5, %zmm13 - -vmulps %zmm0, %zmm6, %zmm14 -vmulps %zmm1, %zmm6, %zmm15 -vmulps %zmm2, %zmm6, %zmm16 - -vmulps %zmm0, %zmm7, %zmm17 -vmulps %zmm1, %zmm7, %zmm18 -vmulps %zmm2, %zmm7, %zmm19 - -addq $32, %rdx -addq $192, %rsi - -cmpq $1, %r11 -jl LastLD1 
- -LoopSzLD1: - vmovups (%rsi), %zmm0 - vmovups 64(%rsi), %zmm1 - vmovups 128(%rsi), %zmm2 - - vbroadcastss (%rdx), %zmm4 - vbroadcastss 4(%rdx), %zmm5 - vbroadcastss 8(%rdx), %zmm6 - vbroadcastss 12(%rdx), %zmm7 - - vfmadd231ps %zmm0, %zmm4, %zmm8 - vfmadd231ps %zmm1, %zmm4, %zmm9 - vfmadd231ps %zmm2, %zmm4, %zmm10 - - vfmadd231ps %zmm0, %zmm5, %zmm11 - vfmadd231ps %zmm1, %zmm5, %zmm12 - vfmadd231ps %zmm2, %zmm5, %zmm13 - - vfmadd231ps %zmm0, %zmm6, %zmm14 - vfmadd231ps %zmm1, %zmm6, %zmm15 - vfmadd231ps %zmm2, %zmm6, %zmm16 - - vfmadd231ps %zmm0, %zmm7, %zmm17 - vfmadd231ps %zmm1, %zmm7, %zmm18 - vfmadd231ps %zmm2, %zmm7, %zmm19 - - addq $32, %rdx - addq $192, %rsi - subq $1, %r11 - cmpq $1, %r11 - jge LoopSzLD1 - -LastLD1: - -movq %rdi, %r11 -TRANSPOSE_SAVE %zmm8, %zmm11, %zmm14, %zmm17 -addq $256, %r11 -TRANSPOSE_SAVE %zmm9, %zmm12, %zmm15, %zmm18 -addq $256, %r11 -TRANSPOSE_SAVE %zmm10, %zmm13, %zmm16, %zmm19 End: diff --git a/source/backend/cpu/x86_x64/avxfma/FunctionSummary.hpp b/source/backend/cpu/x86_x64/avxfma/FunctionSummary.hpp new file mode 100644 index 00000000..7e241a0c --- /dev/null +++ b/source/backend/cpu/x86_x64/avxfma/FunctionSummary.hpp @@ -0,0 +1,50 @@ +// +// FunctionSummary.hpp +// MNN +// +// Created by MNN on 2019/08/25. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#if defined(_MSC_VER) +#include +#else +#include +#endif +#include +#include + +#ifndef _MM_TRANSPOSE4_PS +#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ + do { \ + __m128 tmp3, tmp2, tmp1, tmp0; \ + tmp0 = _mm_unpacklo_ps((row0), (row1)); \ + tmp2 = _mm_unpacklo_ps((row2), (row3)); \ + tmp1 = _mm_unpackhi_ps((row0), (row1)); \ + tmp3 = _mm_unpackhi_ps((row2), (row3)); \ + (row0) = _mm_movelh_ps(tmp0, tmp2); \ + (row1) = _mm_movehl_ps(tmp2, tmp0); \ + (row2) = _mm_movelh_ps(tmp1, tmp3); \ + (row3) = _mm_movehl_ps(tmp3, tmp1); \ + } while (0) +#endif +#include "backend/cpu/compute/Int8FunctionsOpt.h" +#include "backend/cpu/compute/CommonOptFunction.h" + +// ========= CommonOptFunction.cpp =========== +extern "C" { +void _AVX_MNNGemmFloatCommonFMA_4(float* dst, const float* src, const float* weight, size_t src_depth_quad, + size_t dst_step, size_t dst_depth_quad, size_t width, size_t weight_depth_offset); + +void _AVX_MNNGemmFloatUnitFMA_4(float* dstOrigin, const float* src, const float* weight, size_t src_depth_quad, + size_t dst_step, size_t dst_depth_quad, size_t weight_depth_offset); + +void _AVX_MNNPackedMatMulFMA(float* C, const float* A, const float* B, const size_t* parameter, + const float* postParameters, const float* bias); +void _AVX_MNNPackedMatMulRemainFMA(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias); +void _AVX_MNNComputeMatMulForE_1FMA(const float* A, const float* B, float* C, const float* biasPtr, const MatMulParam* param, size_t tId); +void _AVX_MNNPackedMatMulFMA_BF16(float* C, const float* A, const float* B, const size_t* parameter, + const float* postParameters, const float* bias); +void _AVX_MNNPackedMatMulRemainFMA_BF16(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias); + +} diff --git a/source/backend/cpu/x86_x64/avxfma/GemmAVX2FMA.cpp b/source/backend/cpu/x86_x64/avxfma/GemmAVX2FMA.cpp new file mode 100644 index 00000000..8b8e3048 --- /dev/null +++ b/source/backend/cpu/x86_x64/avxfma/GemmAVX2FMA.cpp @@ -0,0 +1,99 @@ +// +// GemmAVX2FMA.cpp +// MNN +// +// Created by MNN on b'2020/09/22'. 
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "FunctionSummary.hpp" +#include "../avx/GemmCommon.hpp" +#include "core/Macro.h" +#define MNNAVXFMA _mm256_fmadd_ps +#define MNNSSEFMA _mm_fmadd_ps +#define BROAD_LOAD(x) _mm256_broadcast_ss(x) +#define BROAD_LOAD_4(x) _mm_broadcast_ss(x) +#define LOAD8(x) _mm256_loadu_ps(x) +#define LOAD4(x) _mm_loadu_ps(x) +#define STORE_4(d, x) _mm_store_ps(d, x) +#define STORE_8(d, x) _mm256_storeu_ps(d, x) + +#include "../avx/GemmFunction.hpp" +#ifdef MNN_X86_USE_ASM +extern "C" { +void _AVX_MNNGemmFloatUnitMainFMA(float* C, const float* A, const float* B, const size_t* parameter, size_t hC4); +} +#endif + +void _AVX_MNNPackedMatMulFMA(float* C, const float* A, const float* B, const size_t* parameter, + const float* postParameters, const float* bias) { + auto h = parameter[2]; + auto hC4 = UP_DIV(h, 4); + auto cStride = parameter[3] / sizeof(float); +#ifdef MNN_X86_USE_ASM + _AVX_MNNGemmFloatUnitMainFMA(C, A, B, parameter, hC4); +#else + _AVX_MNNPackedMatMul_24(C, A, B, parameter); +#endif + AVX2GemmPostTreat(C, 24, parameter, postParameters, bias); +} + +void _AVX_MNNPackedMatMulRemainFMA(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias) { + _AVX_MNNPackednMatMulRemainCommon(C, A, B, eSize, parameter); + AVX2GemmPostTreat(C, eSize, parameter, postParameters, bias); +} + +void _AVX_MNNComputeMatMulForE_1FMA(const float* A, const float* B, float* C, const float* biasPtr, const MatMulParam* param, size_t tId) { + auto l = param->l; + auto h = param->h; + auto numberThread = param->numberThread; + auto lC4 = l / 8; + auto lR = lC4 * 8; + if (param->BTranspose) { + for (int y=tId; y 0) { + for (int y = 0; y < hC4; ++y) { + auto biasValue = LOAD4(bias + 4 * y); + auto bias2 = _mm256_castsi256_ps(_mm256_broadcastsi128_si256(_mm_castps_si128(biasValue))); + auto dst = C + y * cStride; + for (int x = 0; x < eC2; ++x) { + auto sum = _mm256_add_ps(bias2, LOAD8(dst)); + sum = _mm256_max_ps(sum, minV2); + sum = _mm256_min_ps(sum, maxV2); + STORE_8(dst, sum); + dst += 8; + } + auto sum = _mm_add_ps(biasValue, LOAD4(dst)); + sum = _mm_max_ps(sum, minValue); + sum = _mm_min_ps(sum, maxValue); + STORE_4(dst, sum); + } + } else { + for (int y = 0; y < hC4; ++y) { + auto biasValue = LOAD4(bias + 4 * y); + auto bias2 = _mm256_castsi256_ps(_mm256_broadcastsi128_si256(_mm_castps_si128(biasValue))); + auto dst = C + y * cStride; + for (int x = 0; x < eC2; ++x) { + auto sum = _mm256_add_ps(bias2, LOAD8(dst)); + sum = _mm256_max_ps(sum, minV2); + sum = _mm256_min_ps(sum, maxV2); + STORE_8(dst, sum); + dst += 8; + } + } + } + } else { + if (eR > 0) { + for (int y = 0; y < hC4; ++y) { + auto dst = C + y * cStride; + for (int x = 0; x < eC2; ++x) { + auto sum = LOAD8(dst); + sum = _mm256_max_ps(sum, minV2); + sum = _mm256_min_ps(sum, maxV2); + STORE_8(dst, sum); + dst += 8; + } + auto sum = LOAD4(dst); + sum = _mm_max_ps(sum, minValue); + sum = _mm_min_ps(sum, maxValue); + STORE_4(dst, sum); + } + } else { + for (int y = 0; y < hC4; ++y) { + auto dst = C + y * cStride; + for (int x = 0; x < eC2; ++x) { + auto sum = LOAD8(dst); + sum = _mm256_max_ps(sum, minV2); + sum = _mm256_min_ps(sum, maxV2); + STORE_8(dst, sum); + dst += 8; + } + } + } + } +} + +void _AVX_MNNPackedMatMulFMA_BF16(float* C, const float* A, const float* B, const size_t* parameter, + const float* postParameters, const float* bias) { + _AVX_MNNPackedMatMul_3((int16_t*)C, (const int16_t*)A, (const int16_t*)B, 
parameter); + AVX2GemmPostTreatBF16(C, 3, parameter, postParameters, bias); +} +void _AVX_MNNPackedMatMulRemainFMA_BF16(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias) { + _AVX_MNNPackednMatMulRemainCommon((int16_t*)C, (const int16_t*)A, (const int16_t*)B, eSize, parameter); + AVX2GemmPostTreatBF16(C, eSize, parameter, postParameters, bias); +} +#endif diff --git a/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmFloatUnitMainFMA.S b/source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA.S similarity index 100% rename from source/backend/cpu/x86_x64/avx/_AVX_MNNGemmFloatUnitMainFMA.S rename to source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA.S diff --git a/source/backend/cpu/x86_x64/sse/CommonOptFunction.cpp b/source/backend/cpu/x86_x64/sse/CommonOptFunction.cpp index 79cf6efc..71e4ead1 100644 --- a/source/backend/cpu/x86_x64/sse/CommonOptFunction.cpp +++ b/source/backend/cpu/x86_x64/sse/CommonOptFunction.cpp @@ -11,6 +11,25 @@ #include #include "core/Macro.h" #include "FunctionSummary.hpp" +void _SSE_MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters) { + auto minF = _mm_set1_ps(parameters[2]); + auto maxF = _mm_set1_ps(parameters[3]); + auto beta = _mm_set1_ps(parameters[1]); + for (int y = 0; y < height; ++y) { + auto a = A + aStride * y; + auto b = B + 4 * y; + auto bv = _mm_loadu_ps(b); + auto c = C + cStride * y; + for (int x = 0; x < width; ++x) { + auto av = _mm_loadu_ps(a + 4 * x); + auto cv = _mm_add_ps(av, _mm_mul_ps(bv, beta)); + cv = _mm_min_ps(cv, maxF); + cv = _mm_max_ps(cv, minF); + _mm_storeu_ps(c + 4 * x, cv); + } + } +} + void _SSE_MNNInt8ToInt16(int16_t* dest, const int8_t* source, size_t count) { int countC16 = count / 16; int countR = count % 16; @@ -30,45 +49,6 @@ void _SSE_MNNInt8ToInt16(int16_t* dest, const int8_t* source, size_t count) { } } -void _SSE_MNNAddBias(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) { - for (int z = 0; z < biasNumber; ++z) { - auto biasV = _mm_loadu_ps(bias + 4 * z); - float* dst_z = dst + planeNumber * 4 * z; - for (int p = 0; p < planeNumber; ++p) { - auto dstV = _mm_add_ps(_mm_loadu_ps(dst_z + 4 * p), biasV); - _mm_storeu_ps(dst_z + 4 * p, dstV); - } - } -} - -void _SSE_MNNAddBiasRelu(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) { - auto maxV = _mm_set1_ps(0.0f); - for (int z = 0; z < biasNumber; ++z) { - auto biasV = _mm_loadu_ps(bias + 4 * z); - float* dst_z = dst + planeNumber * 4 * z; - for (int p = 0; p < planeNumber; ++p) { - auto dstV = _mm_add_ps(_mm_loadu_ps(dst_z + 4 * p), biasV); - dstV = _mm_max_ps(dstV, maxV); - _mm_storeu_ps(dst_z + 4 * p, dstV); - } - } -} - -void _SSE_MNNAddBiasRelu6(float* dst, const float* bias, size_t planeNumber, size_t biasNumber) { - auto maxV = _mm_set1_ps(0.0f); - auto minV = _mm_set1_ps(6.0f); - for (int z = 0; z < biasNumber; ++z) { - auto biasV = _mm_loadu_ps(bias + 4 * z); - float* dst_z = dst + planeNumber * 4 * z; - for (int p = 0; p < planeNumber; ++p) { - auto dstV = _mm_add_ps(_mm_loadu_ps(dst_z + 4 * p), biasV); - dstV = _mm_max_ps(dstV, maxV); - dstV = _mm_min_ps(dstV, minV); - _mm_storeu_ps(dst_z + 4 * p, dstV); - } - } -} - void _SSE_MNNCopyC4WithStride(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count) { for (int i = 0; i < count; ++i) { auto s = source + i * srcStride; @@ -85,6 +65,14 @@ void 
_SSE_MNNAddC4WithStride(const float* source, float* dest, size_t srcStride, } } +void _SSE_MNNReluInt8(int8_t* dst, const int8_t* src, size_t size) { + auto zero = _mm_set1_epi8(0); + for (int i = 0; i < size; i+=16) { + auto x = _mm_castps_si128(_mm_loadu_ps((const float*)(src + i))); + _mm_storeu_ps((float*)(dst + i), _mm_castsi128_ps(_mm_max_epi8(x, zero))); + } +} + void _SSE_MNNReluWithSlopeChannel(float* dst, const float* src, const float* slope, size_t sizeQuad, size_t depthQuad) { auto zero = _mm_set1_ps(0.0f); for (int j = 0; j < depthQuad; j++) { @@ -101,6 +89,16 @@ void _SSE_MNNReluWithSlopeChannel(float* dst, const float* src, const float* slo } } +void _SSE_MNNHardSwish(float* dst, const float* src, size_t size) { + auto zero = _mm_set1_ps(0.f); + auto three = _mm_set1_ps(3.f); + auto six = _mm_set1_ps(6.f); + for (int i = 0; i < size; i++) { + auto x = _mm_loadu_ps(src + 4 * i); + _mm_storeu_ps(dst + 4 * i, _mm_div_ps(_mm_mul_ps(x, _mm_min_ps(_mm_max_ps(_mm_add_ps(x, three), zero), six)), six)); + } +} + void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep) { @@ -241,6 +239,7 @@ void _SSE_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const fl __m128i zero = _mm_set1_epi32(0); __m128 minValue = _mm_set1_ps(minV); __m128 maxValue = _mm_set1_ps(maxV); + __m128 zeroPointValue = _mm_set1_ps(zeroPoint); __m128 plus = _mm_set1_ps(0.5f); __m128 minus = _mm_set1_ps(-0.5f); __m128 scaleValue = _mm_loadu_ps(scalep); @@ -249,6 +248,7 @@ void _SSE_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const fl for (int i = 0; i < sizeQuad; ++i) { __m128 f0 = _mm_loadu_ps(src + 4 * i); f0 = _mm_mul_ps(f0, scaleValue); + f0 = _mm_add_ps(f0, zeroPointValue); f0 = _mm_min_ps(f0, maxValue); f0 = _mm_max_ps(f0, minValue); auto m0 = _mm_cmplt_ps(f0, _mm_castsi128_ps(zero)); @@ -268,6 +268,7 @@ void _SSE_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, auto sizeRemain = sizeQuad % 4; __m128i zero = _mm_set1_epi32(0); __m128 scaleValue = _mm_loadu_ps(scale); + __m128i zeroPointValue = _mm_set1_epi32(zeroPoint); for (int i = 0; i < sizeC4; ++i) { auto s = _mm_castps_si128(_mm_loadu_ps((const float*)(src))); auto s0_16 = _mm_srai_epi16(_mm_unpacklo_epi8(zero, s), 8); @@ -276,6 +277,10 @@ void _SSE_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, auto s1_32 = _mm_srai_epi32(_mm_unpackhi_epi16(zero, s0_16), 16); auto s2_32 = _mm_srai_epi32(_mm_unpacklo_epi16(zero, s1_16), 16); auto s3_32 = _mm_srai_epi32(_mm_unpackhi_epi16(zero, s1_16), 16); + s0_32 = _mm_sub_epi32(s0_32, zeroPointValue); + s1_32 = _mm_sub_epi32(s1_32, zeroPointValue); + s2_32 = _mm_sub_epi32(s2_32, zeroPointValue); + s3_32 = _mm_sub_epi32(s3_32, zeroPointValue); auto s0_f = _mm_cvtepi32_ps(s0_32); auto s1_f = _mm_cvtepi32_ps(s1_32); auto s2_f = _mm_cvtepi32_ps(s2_32); @@ -297,6 +302,10 @@ void _SSE_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, auto s1_32 = _mm_srai_epi32(_mm_unpackhi_epi16(zero, s0_16), 16); auto s2_32 = _mm_srai_epi32(_mm_unpacklo_epi16(zero, s1_16), 16); auto s3_32 = _mm_srai_epi32(_mm_unpackhi_epi16(zero, s1_16), 16); + s0_32 = _mm_sub_epi32(s0_32, zeroPointValue); + s1_32 = _mm_sub_epi32(s1_32, zeroPointValue); + s2_32 = _mm_sub_epi32(s2_32, zeroPointValue); + s3_32 = _mm_sub_epi32(s3_32, zeroPointValue); auto s0_f = _mm_cvtepi32_ps(s0_32); auto 
s1_f = _mm_cvtepi32_ps(s1_32); auto s2_f = _mm_cvtepi32_ps(s2_32); @@ -709,3 +718,7 @@ void MNNInt8ToUInt8(void* ptr, int count) { } } } + +void MNNCoreFunctionInit() { + +} diff --git a/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp b/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp index 47f8a4f0..a7d7725f 100644 --- a/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp +++ b/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp @@ -30,12 +30,7 @@ (row3) = _mm_movehl_ps(tmp3, tmp1); \ } while (0) #endif - -void _SSE_MNNAddBias(float* dst, const float* bias, size_t planeNumber, size_t biasNumber); - -void _SSE_MNNAddBiasRelu(float* dst, const float* bias, size_t planeNumber, size_t biasNumber); - -void _SSE_MNNAddBiasRelu6(float* dst, const float* bias, size_t planeNumber, size_t biasNumber); +void _SSE_MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters); void _SSE_MNNCopyC4WithStride(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count); @@ -61,14 +56,16 @@ void _SSE_MNNMatrixSub(float* C, const float* A, const float* B, size_t widthC4, void _SSE_MNNReluWithSlopeChannel(float* dst, const float* src, const float* slope, size_t sizeQuad, size_t depthQuad); +void _SSE_MNNHardSwish(float* dst, const float* src, size_t size); + void _SSE_MNNStrassenMergeCFunction(float* c11, float* c12, float* c21, float* c22, float* xAddr, size_t cStride, size_t length, size_t hSub); -void _SSE_MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter, float* cache, +void _SSE_MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias); void _SSE_MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, - float* cache, const float* postParameters, const float* bias); -void _SSE_MNNPackC4ForMatMul_A(float* dest, const float* source, size_t e, size_t l, size_t eReal); + const float* postParameters, const float* bias); +void _SSE_MNNPackC4ForMatMul_A(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el); void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep); @@ -83,3 +80,6 @@ void _SSE_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, void _SSE_MNNLineDepthWiseInt8AddBiasScaleUnit(int8_t* dst, const int8_t* src, const int8_t* weight, const QuanPostTreatParameters* parameters, size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step); void _SSE_MNNInt8ToInt16(int16_t* dest, const int8_t* source, size_t count); void _SSE_MNNComputeMatMulForE_1(const float* A, const float* B, float* C, const float* biasPtr, const MatMulParam* param, size_t tId); + +void _SSE_MNNPackForMatMul_B_BF16(float* dest, const float* source, size_t h, size_t l, bool transpose); +void _SSE_MNNReluInt8(int8_t* dst, const int8_t* src, size_t size); diff --git a/source/backend/cpu/x86_x64/sse/GemmCommon.cpp b/source/backend/cpu/x86_x64/sse/GemmCommon.cpp index 33c4615e..15e34dbe 100644 --- a/source/backend/cpu/x86_x64/sse/GemmCommon.cpp +++ b/source/backend/cpu/x86_x64/sse/GemmCommon.cpp @@ -30,91 +30,104 @@ bool _SSE_MNNReorder4x4ByPlatform(float* dst, size_t number) { return true; } -void 
_SSE_MNNPackC4ForMatMul_A(float* dest, const float* source, size_t e, size_t l, size_t eReal) { - const int pack = 12; - const int mid = 1; // Deprecate - const int packC4 = pack / 4; - auto ePack = e / pack; - auto lC4 = l / 4; - auto lDiv = UP_DIV(l, 4); - auto eRemain = ePack * pack; - auto lRemain = lC4 * 4; - auto lRes = l - lRemain; - for (int y = 0; y < ePack; ++y) { - auto dstY = dest + y * l * pack; - auto srcY = source + y * pack * 4; - for (int x = 0; x < lC4; ++x) { - auto srcX = srcY + x * 4 * eReal; - auto dstX = dstY + x * pack * 4; - auto s00 = _mm_loadu_ps(srcX + 0 * 4); - auto s01 = _mm_loadu_ps(srcX + 1 * 4); - auto s02 = _mm_loadu_ps(srcX + 2 * 4); - auto s03 = _mm_loadu_ps(srcX + 3 * 4); - auto s10 = _mm_loadu_ps(srcX + 4 * 4); - auto s11 = _mm_loadu_ps(srcX + 5 * 4); - auto s12 = _mm_loadu_ps(srcX + 6 * 4); - auto s13 = _mm_loadu_ps(srcX + 7 * 4); - auto s20 = _mm_loadu_ps(srcX + 8 * 4); - auto s21 = _mm_loadu_ps(srcX + 9 * 4); - auto s22 = _mm_loadu_ps(srcX + 10 * 4); - auto s23 = _mm_loadu_ps(srcX + 11 * 4); +void _SSE_MNNPackC4ForMatMul_A(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el) { + int number = info[0]; + int eReal = info[1]; + int eDest = info[2]; + int xStride = info[3]; + int xS4 = xStride * 4; + for (int n=0; n -#else -#include -#endif -#include - -static __m128 merge(__m128 d0, __m128 d1, __m128 d2, __m128 d3) { - auto d00 = _mm_hadd_ps(d0, d1); - auto d01 = _mm_hadd_ps(d2, d3); - return _mm_hadd_ps(d00, d01); -} - -#define COMPUTE(i) \ - { \ - d0##i = _mm_add_ps(_mm_mul_ps(w##i, s0), d0##i); \ - d1##i = _mm_add_ps(_mm_mul_ps(w##i, s1), d1##i); \ - d2##i = _mm_add_ps(_mm_mul_ps(w##i, s2), d2##i); \ - d3##i = _mm_add_ps(_mm_mul_ps(w##i, s3), d3##i); \ - } -#define STORE(i) _mm_storeu_ps(dst_x + 4 * i, merge(d##i##0, d##i##1, d##i##2, d##i##3)); - -void _SSE_MNNGemmFloatCommon_4(float* dst, const float* src, const float* weight, size_t src_depth_quad, - size_t dst_step, size_t dst_depth_quad, size_t width, size_t weight_depth_offset) { - auto src_depth_step = 4 * width; - int wC4 = width / 4; - int w4End = wC4 * 4; - for (int dz = 0; dz < dst_depth_quad; ++dz) { - float* dst_z = dst + dz * dst_step; - auto weight_dz = weight + dz * (src_depth_quad * 16 + weight_depth_offset); - for (int dx = 0; dx < wC4; ++dx) { - float* dst_x = dst_z + dx * 16; - const float* src_dx = src + 16 * dx; - auto iw0 = _mm_loadu_ps(weight_dz + 4 * 0); - auto iw1 = _mm_loadu_ps(weight_dz + 4 * 1); - auto iw2 = _mm_loadu_ps(weight_dz + 4 * 2); - auto iw3 = _mm_loadu_ps(weight_dz + 4 * 3); - auto is0 = _mm_loadu_ps(src_dx + 4 * 0); - auto is1 = _mm_loadu_ps(src_dx + 4 * 1); - auto is2 = _mm_loadu_ps(src_dx + 4 * 2); - auto is3 = _mm_loadu_ps(src_dx + 4 * 3); - - auto d00 = _mm_mul_ps(is0, iw0); - auto d01 = _mm_mul_ps(is0, iw1); - auto d02 = _mm_mul_ps(is0, iw2); - auto d03 = _mm_mul_ps(is0, iw3); - - auto d10 = _mm_mul_ps(is1, iw0); - auto d11 = _mm_mul_ps(is1, iw1); - auto d12 = _mm_mul_ps(is1, iw2); - auto d13 = _mm_mul_ps(is1, iw3); - - auto d20 = _mm_mul_ps(is2, iw0); - auto d21 = _mm_mul_ps(is2, iw1); - auto d22 = _mm_mul_ps(is2, iw2); - auto d23 = _mm_mul_ps(is2, iw3); - - auto d30 = _mm_mul_ps(is3, iw0); - auto d31 = _mm_mul_ps(is3, iw1); - auto d32 = _mm_mul_ps(is3, iw2); - auto d33 = _mm_mul_ps(is3, iw3); - - for (int sz = 1; sz < src_depth_quad; ++sz) { - const float* src_z = src_dx + sz * src_depth_step; - const float* weight_z = weight_dz + sz * 16; - auto w0 = _mm_loadu_ps(weight_z + 4 * 0); - auto w1 = _mm_loadu_ps(weight_z + 4 
* 1); - auto w2 = _mm_loadu_ps(weight_z + 4 * 2); - auto w3 = _mm_loadu_ps(weight_z + 4 * 3); - auto s0 = _mm_loadu_ps(src_z + 4 * 0); - auto s1 = _mm_loadu_ps(src_z + 4 * 1); - auto s2 = _mm_loadu_ps(src_z + 4 * 2); - auto s3 = _mm_loadu_ps(src_z + 4 * 3); - - COMPUTE(0); - COMPUTE(1); - COMPUTE(2); - COMPUTE(3); - } - STORE(0); - STORE(1); - STORE(2); - STORE(3); - } - for (int dx = w4End; dx < width; ++dx) { - float* dst_x = dst_z + dx * 4; - auto d0 = _mm_set1_ps(0.0f); - auto d1 = _mm_set1_ps(0.0f); - auto d2 = _mm_set1_ps(0.0f); - auto d3 = _mm_set1_ps(0.0f); - - const float* src_dx = src + 4 * dx; - for (int sz = 0; sz < src_depth_quad; ++sz) { - const float* src_z = src_dx + sz * src_depth_step; - const float* weight_z = weight_dz + sz * 16; - auto w0 = _mm_loadu_ps(weight_z + 4 * 0); - auto w1 = _mm_loadu_ps(weight_z + 4 * 1); - auto w2 = _mm_loadu_ps(weight_z + 4 * 2); - auto w3 = _mm_loadu_ps(weight_z + 4 * 3); - auto s = _mm_loadu_ps(src_z); - - auto sw0 = _mm_mul_ps(s, w0); - auto sw1 = _mm_mul_ps(s, w1); - auto sw2 = _mm_mul_ps(s, w2); - auto sw3 = _mm_mul_ps(s, w3); - d0 = _mm_add_ps(d0, sw0); - d1 = _mm_add_ps(d1, sw1); - d2 = _mm_add_ps(d2, sw2); - d3 = _mm_add_ps(d3, sw3); - } - _mm_storeu_ps(dst_x, merge(d0, d1, d2, d3)); - } - } -} - -void _SSE_MNNGemmFloatUnit_4(float* dst, const float* src, const float* weight, size_t src_depth_quad, size_t dst_step, - size_t dst_depth_quad, size_t weight_depth_offset) { - return _SSE_MNNGemmFloatCommon_4(dst, src, weight, src_depth_quad, dst_step, dst_depth_quad, 8, - weight_depth_offset); -} diff --git a/source/backend/cuda/core/CUDABackend.cpp b/source/backend/cuda/core/CUDABackend.cpp index fffde4d3..2e113567 100644 --- a/source/backend/cuda/core/CUDABackend.cpp +++ b/source/backend/cuda/core/CUDABackend.cpp @@ -58,7 +58,7 @@ CUDARuntimeWrapper::CUDARuntimeWrapper(BackendConfig::PrecisionMode precision, B CUDARuntimeWrapper::~CUDARuntimeWrapper() { // Do nothing } -Backend* CUDARuntimeWrapper::onCreate() const { +Backend* CUDARuntimeWrapper::onCreate(const BackendConfig* config) const { return new CUDABackend(mBufferPool, mCUDARuntime); } @@ -147,11 +147,8 @@ std::pair CUDABackend::onMeasure(const std::vector& inputs return std::make_pair(0.0f, false); } const float defaultScheduleTime = 0.05f; -#ifndef MNN_BUILD_MINI - auto flops = SizeComputer::computeFlops(op, inputs, outputs); -#else + // FIXME: Compute in future auto flops = 0.0f; -#endif auto computeFlops = mCUDARuntime->flops(); return std::make_pair(defaultScheduleTime + flops / 1024.0f / computeFlops * 1000.0f, true); } @@ -214,28 +211,28 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) auto needSize = realSize(srcTensor) * srcTensor->getType().bytes(); std::shared_ptr srcTempTensor; std::shared_ptr dstTempTensor; - if (srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4) { - srcTempTensor.reset(new Tensor(srcTensor, Tensor::CAFFE, true)); - MNNCPUCopyBuffer(srcTensor, srcTempTensor.get()); - srcTensor = srcTempTensor.get(); - } - if (dstDimensionFormat == MNN_DATA_FORMAT_NC4HW4) { - dstTempTensor.reset(new Tensor(dstTensor, Tensor::CAFFE, true), [dstTensor](void* ptr) { - auto src = (Tensor*)ptr; - MNNCPUCopyBuffer(src, dstTensor); - delete src; - }); - dstTensor = dstTempTensor.get(); - } + if (srcTensor->deviceId() != 0 && dstTensor->deviceId() != 0) { mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), (void*)(srcTensor->deviceId()), needSize, - MNNMemcpyDeviceToDevice, true); + MNNMemcpyDeviceToDevice, true); } if (srcTensor->deviceId() 
!= 0 && dstTensor->deviceId() == 0) { - mCUDARuntime->memcpy(dstTensor->host(), (void*)(srcTensor->deviceId()), needSize, MNNMemcpyDeviceToHost, + if(dstDimensionFormat == MNN_DATA_FORMAT_NCHW) { + mCUDARuntime->memcpy(dstTensor->host(), (void*)(srcTensor->deviceId()), needSize, MNNMemcpyDeviceToHost, true); + } else { + dstTempTensor.reset(new Tensor(srcTensor, srcTensor->getDimensionType(), true)); + mCUDARuntime->memcpy(dstTempTensor->host(), (void*)(srcTensor->deviceId()), needSize, MNNMemcpyDeviceToHost, + true); + MNNCPUCopyBuffer(dstTempTensor.get(), dstTensor); + } } if (srcTensor->deviceId() == 0 && dstTensor->deviceId() != 0) { + if (srcDimensionFormat != MNN_DATA_FORMAT_NCHW) { + srcTempTensor.reset(new Tensor(dstTensor, dstTensor->getDimensionType(), true)); + MNNCPUCopyBuffer(srcTensor, srcTempTensor.get()); + srcTensor = srcTempTensor.get(); + } mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), srcTensor->host(), needSize, MNNMemcpyHostToDevice, true); } diff --git a/source/backend/cuda/core/CUDABackend.hpp b/source/backend/cuda/core/CUDABackend.hpp index f04f2325..c31197c0 100644 --- a/source/backend/cuda/core/CUDABackend.hpp +++ b/source/backend/cuda/core/CUDABackend.hpp @@ -23,7 +23,7 @@ class MNN_PUBLIC CUDARuntimeWrapper : public Runtime { public: CUDARuntimeWrapper(BackendConfig::PrecisionMode precision, BackendConfig::PowerMode power); virtual ~CUDARuntimeWrapper(); - virtual Backend *onCreate() const override; + virtual Backend *onCreate(const BackendConfig* config) const override; virtual void onGabageCollect(int level) override; bool isCreateError() const { return mIsCreateError; diff --git a/source/backend/cuda/execution/UnaryExecution.cu b/source/backend/cuda/execution/UnaryExecution.cu index 06fe3842..6c0f910b 100644 --- a/source/backend/cuda/execution/UnaryExecution.cu +++ b/source/backend/cuda/execution/UnaryExecution.cu @@ -213,6 +213,19 @@ __global__ void ASINH(T *input, T *output, size_t count) { } return; } +template +__global__ void HARDSWISH(T *input, T *output, size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + if (input[i] <= -3) { + output[i] = 0; + } else if (input[i] >= 3) { + output[i] = input[i]; + } else { + output[i] = input[i] * (input[i] + 3) / 6; + } + } + return; +} void callUnary(void *input, void *output, size_t count, MNN::CUDARuntime* runtime, halide_type_t data_type, MNN::UnaryOpOperation op_type) @@ -249,6 +262,7 @@ void callUnary(void *input, void *output, size_t count, MNN::CUDARuntime* runtim COMPUTE(ROUND); COMPUTE(SINH); COMPUTE(ASINH); + COMPUTE(HARDSWISH); //case CudaUnaryOpOperation_BNLL: //case CudaUnaryOpOperation_ERF: diff --git a/source/backend/hiai/backend/NPUBackend.cpp b/source/backend/hiai/backend/NPUBackend.cpp index 0b4c49e9..a89b0989 100644 --- a/source/backend/hiai/backend/NPUBackend.cpp +++ b/source/backend/hiai/backend/NPUBackend.cpp @@ -161,7 +161,6 @@ namespace MNN { return NO_ERROR; } - printf("batch:%d,channel:%d,area:%d \n",batch,channel,area); if (MNN_DATA_FORMAT_NHWC == source && MNN_DATA_FORMAT_NCHW == dest) { if (bitLength != 4) { return NOT_SUPPORT; @@ -241,7 +240,6 @@ namespace MNN { shared_ptr desc = make_shared(model_name, 3, 0, 0, 0); desc->SetModelBuffer(buffer->GetMemBufferData(), buffer->GetMemBufferSize()); - MNN_PRINT("[NPU] loadModel %s \n", desc->GetName().c_str()); vector> model_desc; model_desc.push_back(desc); @@ -284,36 +282,12 @@ namespace MNN { void NPUBackend::setNetworkInput(const std::vector &inputs, const Op* op){ 
Tensor *inputTensor = inputs[0]; - MNN_PRINT("op name : %s \n op type : %s \n", op->name()->c_str(), EnumNameOpType(op->type())); - - for (size_t i = 0; i < inputs.size(); ++i){ - auto input = inputs[i]; - MNN_PRINT("\n"); - MNN_PRINT("in nchw : %d, %d, %d, %d \n", input->batch(), input->channel(), input->height(), input->width()); - for (size_t i = 0; i < input->buffer().dimensions; i++){ - MNN_PRINT("%d , ", input->buffer().dim[i].extent); - } - MNN_PRINT("\n"); - } - - // for (size_t i = 0; i < outputs.size(); i++){ - // auto output = outputs[i]; - // MNN_PRINT("\n"); - // MNN_PRINT("out nchw : %d, %d, %d, %d \n", output->batch(), output->channel(), output->height(), output->width()); - // for (size_t i = 0; i < output->buffer().dimensions; i++){ - // MNN_PRINT("%d , ", output->buffer().dim[i].extent); - // } - // MNN_PRINT("\n"); - // } - auto inputIndex = op->inputIndexes()->data()[0]; auto outputIndex = op->outputIndexes()->data()[0]; bool isInput = TensorUtils::getDescribe(inputTensor)->usage==Tensor::InsideDescribe::Usage::INPUT; if (isInput && mGrapMap.find(inputIndex) == mGrapMap.end()) { auto opName = string("input") + to_string(inputIndex); shared_ptr data(new ge::op::Data(opName)); - MNN_PRINT("input format : %d \n", TensorUtils::getDescribe(inputTensor)->dimensionFormat); - MNN_PRINT("shape : [%d, %d, %d, %d] \n", inputTensor->buffer().dim[0].extent, inputTensor->buffer().dim[1].extent, inputTensor->buffer().dim[2].extent, inputTensor->buffer().dim[3].extent); auto shape = tensorShapeFormat(inputTensor); ge::TensorDesc desc(ge::Shape(shape), ge::FORMAT_NCHW, ge::DT_FLOAT); data->update_input_desc_x(desc); @@ -333,14 +307,24 @@ namespace MNN { auto iter = map->find(op->type()); if (iter == map->end()) { - MNN_PRINT("[NPU] Don't support type %d, %s\n", op->type(), op->name()->c_str()); + MNN_ERROR("map not find !!! \n"); + if(op != nullptr){ + if(op->name() != nullptr){ + MNN_PRINT("[NPU] Don't support type %d, %s\n", op->type(), op->name()->c_str()); + } + } return nullptr; } auto exe = iter->second->onCreate(inputs, outputs, op, this); if (nullptr == exe) { - MNN_PRINT("[NPU] The Creator Don't support type %d, %s\n", op->type(), op->name()->c_str()); + MNN_ERROR("nullptr == exe !!! 
\n"); + if(op != nullptr){ + if(op->name() != nullptr){ + MNN_PRINT("[NPU] The Creator Don't support type %d, %s\n", op->type(), op->name()->c_str()); + } + } return nullptr; } @@ -360,9 +344,6 @@ namespace MNN { if(isInputCopy){ mInputMap.insert(make_pair((unsigned long)tensor, mInputMap.size())); } - if(isOutputCopy){ - mOutputMap.insert(make_pair((unsigned long)tensor, mOutputMap.size())); - } return true; } @@ -393,9 +374,20 @@ namespace MNN { memcpy(input->GetBuffer(), tmpTensor->host(), (size_t)tmpTensor->size()); } else if(isOutputCopy){ - auto index = mOutputMap.find((unsigned long)(const_cast(srcTensor))); - MNN_ASSERT(index != mOutputMap.end()); - shared_ptr output = mOutputTensors[index->second]; + int index; + bool flag = false; + for(index = 0; index < mMNNOutTensors.size(); index++) { + if(mMNNOutTensors[index] == srcTensor) { + flag = true; + break; + } + } + if(flag == false) { + MNN_PRINT("MNNTensor and HIAITensor mismatch!"); + return; + } + + shared_ptr output = mOutputTensors[index]; auto tmpShape = tensorShapeFormat(srcTensor); vector srcShape = {(int)tmpShape[0],(int)tmpShape[1],(int)tmpShape[2],(int)tmpShape[3]}; shared_ptr tmpTensor(Tensor::create(srcShape,halide_type_of(), @@ -404,17 +396,15 @@ namespace MNN { auto shape = output->GetTensorDimension(); tensorConvert(tmpTensor.get(), dstTensor); } - - // setTensorIndex(); } void NPUBackend::onResizeBegin() { mGrapMap.clear(); - mGrapIOMap.clear(); + mOutGEOpMap.clear(); mInputOps.clear(); - mOutputOps.clear(); mInputTensors.clear(); mOutputTensors.clear(); + mMNNOutTensors.clear(); mSclipMap.clear(); } @@ -442,13 +432,26 @@ namespace MNN { input->Init(&in_dim); mInputTensors.push_back(input); } + auto index =0; for (auto out_dim : mOutputDimension) { shared_ptr output = make_shared(); + MNN_PRINT("%d HiAiTensor output DIM:%u,%u,%u,%u\n", index, + out_dim.GetNumber(), out_dim.GetChannel(), + out_dim.GetHeight(), out_dim.GetWidth()); output->Init(&out_dim); mOutputTensors.push_back(output); + index++; + } + index = 0; + for(auto opMap : mOutGEOpMap){ + for(auto tensor: opMap.second){ + mMNNOutTensors.push_back(tensor); + MNN_PRINT("%d MNNTensor output DIM:%d,%d,%d,%d\n",index, + tensor->batch(),tensor->channel(),tensor->height(),tensor->width()); + index++; + } } - return 0; } @@ -460,13 +463,18 @@ namespace MNN { for (auto input : mInputOps){ inputs.push_back(input.second[0]); } + std::vector outputOps; + for(auto outOp : mOutGEOpMap) { + outputOps.push_back(*outOp.first.get()); + } + MNN_PRINT("mOutputOps : %lu \n", outputOps.size()); string graphName = string("Graph1"); string version = string("model_v000011"); string modelName = to_string(0); mModelName.push_back(modelName); ge::Graph graph(graphName); - graph.SetInputs(inputs).SetOutputs(mOutputOps); + graph.SetInputs(inputs).SetOutputs(outputOps); ge::Model model(modelName, version); model.SetGraph(graph); @@ -475,11 +483,11 @@ namespace MNN { domi::HiaiIrBuild ir_build; domi::ModelBufferData om_model_buff; - ge::Buffer buffer; - ge::GraphErrCodeStatus geret = model.Save(buffer); - if(geret != 0) { - MNN_ERROR("[NPU] Model save failed \n"); - } + // ge::Buffer buffer; + // ge::GraphErrCodeStatus geret = model.Save(buffer); + // if(geret != 0) { + // MNN_ERROR("[NPU] Model save failed \n"); + // } //WriteToBufferFile(buffer, "/data/local/tmp/test.irpb"); @@ -544,18 +552,8 @@ namespace MNN { return ops[index]; } - void NPUBackend::setOutputIOOps(const Op *op, vector>&& HIAI_op){ - for (size_t i = 0; i < op->outputIndexes()->size(); i++){ - auto index = 
op->outputIndexes()->data()[i]; - vector, string>> ops; - for (size_t j = 0; j < HIAI_op.size(); j++){ - ops.emplace_back(make_pair(HIAI_op[j], "")); - } - mGrapIOMap.insert(make_pair(index, ops)); - } - } - - void NPUBackend::setOutputOps(const Op *op, vector>&& HIAI_op){ + void NPUBackend::setOutputOps(const Op *op, vector>&& HIAI_op, + const std::vector &outputs){ if(op->type() == OpType_Slice){ for (size_t i = 0; i < op->outputIndexes()->size(); i++){ auto index = op->outputIndexes()->data()[i]; @@ -570,8 +568,20 @@ namespace MNN { } mGrapMap.insert(make_pair(index, ops)); } - } + MNNTensorList tensors; + for (auto out: outputs) + { + bool isOutput = (TensorUtils::getDescribe(out)->usage + ==Tensor::InsideDescribe::Usage::OUTPUT); + if(isOutput == true){ + tensors.push_back(out); + } + } + if(!tensors.empty()) { + mOutGEOpMap.insert(make_pair(HIAI_op[HIAI_op.size()-1], tensors)); + } + } NPURuntime::NPURuntime(const Backend::Info& info) { mInfo = info; @@ -588,7 +598,7 @@ namespace MNN { NPURuntime::~NPURuntime() {} - Backend* NPURuntime::onCreate() const { + Backend* NPURuntime::onCreate(const BackendConfig* config) const { return new NPUBackend(this); } @@ -602,7 +612,6 @@ namespace MNN { struct NPUBackendCreator : RuntimeCreator { virtual Runtime* onCreate(const Backend::Info& info) const override { - AUTOTIME; { shared_ptr mgrClient = make_shared(); if(mgrClient.get() == nullptr){ diff --git a/source/backend/hiai/backend/NPUBackend.hpp b/source/backend/hiai/backend/NPUBackend.hpp index 053235bd..d21811e4 100644 --- a/source/backend/hiai/backend/NPUBackend.hpp +++ b/source/backend/hiai/backend/NPUBackend.hpp @@ -31,7 +31,7 @@ using namespace std; namespace MNN { - + typedef std::vector MNNTensorList; void NHWC2NCHW(const float* source, float* dest, int b, int c, int area); inline std::vector tensorShapeFormat(const Tensor *input, const Tensor *broadCastInput=nullptr) { auto dimSize = input->buffer().dimensions; @@ -206,7 +206,7 @@ namespace MNN { NPURuntime(const Backend::Info& info); virtual ~NPURuntime(); virtual CompilerType onGetCompilerType() const override; - virtual Backend* onCreate() const override; + virtual Backend* onCreate(const BackendConfig* conf) const override; virtual void onGabageCollect(int level) override; // If buffer is not nullptr, try copy cache, else delete cache virtual bool onSetCache(const void* buffer, size_t size) override { @@ -269,9 +269,8 @@ namespace MNN { shared_ptr getInputOps(const Op *op, int index = 0); - void setOutputOps(const Op *op, vector>&& HIAI_op); - void setOutputIOOps(const Op *op, vector>&& HIAI_op); - + void setOutputOps(const Op *op, vector>&& HIAI_op, + const std::vector &outputs); void setNetworkInput(const std::vector &inputs, const Op* op); private: @@ -281,14 +280,12 @@ namespace MNN { public: map, string>>> mGrapMap; - map, string>>> mGrapIOMap; + map, MNNTensorList> mOutGEOpMap; map> mInputOps; - std::vector mOutputOps; map mSclipMap; map mInputMap; - map mOutputMap; public: class Creator { public: @@ -308,6 +305,7 @@ namespace MNN { vector> mInputTensors; vector> mOutputTensors; + MNNTensorList mMNNOutTensors; const NPURuntime* mNPURuntime; BackendConfig::PrecisionMode mPrecision; }; diff --git a/source/backend/hiai/execution/NPUActivation.cpp b/source/backend/hiai/execution/NPUActivation.cpp index a4efd3c2..61b5130f 100644 --- a/source/backend/hiai/execution/NPUActivation.cpp +++ b/source/backend/hiai/execution/NPUActivation.cpp @@ -43,21 +43,14 @@ ErrorCode NPUActivation::onResize(const std::vector &inputs, const std 
(*prelu) .set_input_x(*xOp.get()).set_input_weight(mConst_w); - mNpuBackend->setOutputOps(mOp, {prelu}); + mNpuBackend->setOutputOps(mOp, {prelu}, outputs); }else{ shared_ptr relu(new ge::op::Activation(opName + "_relu")); - if (mType == 1 && mOp->main_as_Relu()->slope() != 0.0f) { - //Leaky Relu - float slope = mOp->main_as_Relu()->slope(); - mType = 5; - (*relu) - .set_attr_negative_slope(slope); - } (*relu) .set_input_x(*xOp.get()) .set_attr_coef(.000000) .set_attr_mode(mType); - mNpuBackend->setOutputOps(mOp, {relu}); + mNpuBackend->setOutputOps(mOp, {relu}, outputs); } @@ -92,4 +85,4 @@ NPUCreatorRegister __sigmoid_op(OpType_Sigmoid); NPUCreatorRegister __prelu_op(OpType_PReLU); NPUCreatorRegister __tanh_op(OpType_TanH); -} // namespace MNN +} // namespace MNN \ No newline at end of file diff --git a/source/backend/hiai/execution/NPUArgMax.cpp b/source/backend/hiai/execution/NPUArgMax.cpp index e6b70679..86aa47f0 100644 --- a/source/backend/hiai/execution/NPUArgMax.cpp +++ b/source/backend/hiai/execution/NPUArgMax.cpp @@ -39,7 +39,7 @@ ErrorCode NPUArgMax::onResize(const std::vector &inputs, const std::ve (*argMax) .set_input_x(*xOp.get()) .set_input_axis(mConst_axis); - mNpuBackend->setOutputOps(mOp, {argMax}); + mNpuBackend->setOutputOps(mOp, {argMax}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUBinary.cpp b/source/backend/hiai/execution/NPUBinary.cpp index 2f162b14..5b38f44d 100644 --- a/source/backend/hiai/execution/NPUBinary.cpp +++ b/source/backend/hiai/execution/NPUBinary.cpp @@ -14,56 +14,58 @@ using namespace std; namespace MNN { -void NPUBinary::OpInsert(int binary_type, vector, string>>& ops, string opName, ge::Operator& input0, ge::Operator& input1){ +void NPUBinary::OpInsert(int binary_type, string opName, + ge::Operator& input0, ge::Operator& input1, + const std::vector &outputs){ if(binary_type == BinaryOpOperation_ADD) { shared_ptr binary(new ge::op::Add(opName)); - ops.emplace_back(make_pair(binary, "")); (*binary) .set_input_x1(input0) .set_input_x2(input1); + mNpuBackend->setOutputOps(mOp, {binary}, outputs); } else if(binary_type == BinaryOpOperation_MUL) { shared_ptr binary(new ge::op::Mul(opName)); - ops.emplace_back(make_pair(binary, "")); (*binary) .set_input_x(input0) .set_input_y(input1); + mNpuBackend->setOutputOps(mOp, {binary}, outputs); } else if(binary_type == BinaryOpOperation_REALDIV) { shared_ptr binary(new ge::op::RealDiv(opName)); - ops.emplace_back(make_pair(binary, "")); (*binary) .set_input_x1(input0) .set_input_x2(input1); + mNpuBackend->setOutputOps(mOp, {binary}, outputs); } else if(binary_type == BinaryOpOperation_SUB) { shared_ptr binary(new ge::op::Sub(opName)); - ops.emplace_back(make_pair(binary, "")); (*binary) .set_input_x1(input0) .set_input_x2(input1); + mNpuBackend->setOutputOps(mOp, {binary}, outputs); } else if(binary_type == BinaryOpOperation_MINIMUM) { shared_ptr binary(new ge::op::Minimum(opName)); - ops.emplace_back(make_pair(binary, "")); (*binary) .set_input_x1(input0) .set_input_x2(input1); + mNpuBackend->setOutputOps(mOp, {binary}, outputs); } else if(binary_type == BinaryOpOperation_MAXIMUM) { shared_ptr binary(new ge::op::Maximum(opName)); - ops.emplace_back(make_pair(binary, "")); (*binary) .set_input_x1(input0) .set_input_x2(input1); + mNpuBackend->setOutputOps(mOp, {binary}, outputs); } else if(binary_type == BinaryOpOperation_EQUAL) { shared_ptr binary(new ge::op::Equal(opName)); - ops.emplace_back(make_pair(binary, "")); (*binary) .set_input_x1(input0) .set_input_x2(input1); + 
mNpuBackend->setOutputOps(mOp, {binary}, outputs); } else if(binary_type == BinaryOpOperation_LESS_EQUAL) { shared_ptr binary(new hiai::op::LessEqual(opName)); - ops.emplace_back(make_pair(binary, "")); (*binary) .set_input_x1(input0) .set_input_x2(input1); + mNpuBackend->setOutputOps(mOp, {binary}, outputs); }else{ MNN_ERROR("npu binary not support type : %d \n", binary_type); MNN_ASSERT(false); @@ -154,14 +156,14 @@ ErrorCode NPUBinary::onResize(const std::vector &inputs, const std::ve auto iops0 = mNpuBackend->mGrapMap[inputIndex0]; // x auto xOp0 = iops0.back().first; - OpInsert(binary_type, ops, opName, *xOp0.get(), mConst); + OpInsert(binary_type, opName, *xOp0.get(), mConst, outputs); }else if(isConst0 && !isConst1){ // auto inputIndex1 = mOp->inputIndexes()->data()[1]; auto iops1 = mNpuBackend->mGrapMap[inputIndex1]; // x auto xOp1 = iops1.back().first; - OpInsert(binary_type, ops, opName, mConst, *xOp1.get()); + OpInsert(binary_type, opName, mConst, *xOp1.get(), outputs); }else{ @@ -175,12 +177,11 @@ ErrorCode NPUBinary::onResize(const std::vector &inputs, const std::ve auto iops1 = mNpuBackend->mGrapMap[inputIndex1]; // x auto xOp1 = iops1.front().first; - OpInsert(binary_type, ops, opName, *xOp0.get(), *xOp1.get()); + OpInsert(binary_type, opName, *xOp0.get(), *xOp1.get(), outputs); } auto index = mOp->outputIndexes()->data()[0]; - mNpuBackend->mGrapMap.insert(make_pair(index, ops)); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUBinary.hpp b/source/backend/hiai/execution/NPUBinary.hpp index 1f1bab2d..a3144bdc 100644 --- a/source/backend/hiai/execution/NPUBinary.hpp +++ b/source/backend/hiai/execution/NPUBinary.hpp @@ -16,7 +16,9 @@ namespace MNN { class NPUBinary : public NPUCommonExecution { public: - void OpInsert(int binary_type, vector, string>>& ops, string opName, ge::Operator& input0, ge::Operator& input1); + void OpInsert(int binary_type, string opName, + ge::Operator& input0, ge::Operator& input1, + const std::vector &outputs); NPUBinary(Backend *b, const Op *op, const std::vector &inputs, const std::vector &outputs); ErrorCode onResize(const std::vector &inputs, const std::vector &outputs); virtual ~NPUBinary() = default; diff --git a/source/backend/hiai/execution/NPUCast.cpp b/source/backend/hiai/execution/NPUCast.cpp index 30e1b957..a2b1b14f 100644 --- a/source/backend/hiai/execution/NPUCast.cpp +++ b/source/backend/hiai/execution/NPUCast.cpp @@ -63,7 +63,7 @@ ErrorCode NPUCast::onResize(const std::vector &inputs, const std::vect .set_input_x(*xOp.get()) .set_attr_SrcT(mapDataType(srcT)) .set_attr_DstT(mapDataType(dstT)); - mNpuBackend->setOutputOps(mOp, {castOp}); + mNpuBackend->setOutputOps(mOp, {castOp}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUConcat.cpp b/source/backend/hiai/execution/NPUConcat.cpp index 04dcaafa..bba38f55 100644 --- a/source/backend/hiai/execution/NPUConcat.cpp +++ b/source/backend/hiai/execution/NPUConcat.cpp @@ -36,7 +36,7 @@ ErrorCode NPUConcat::onResize(const std::vector &inputs, const std::ve (*concat).set_dynamic_input_x(i + 1, *px); } - mNpuBackend->setOutputOps(mOp, {concat}); + mNpuBackend->setOutputOps(mOp, {concat}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUConvertTensor.cpp b/source/backend/hiai/execution/NPUConvertTensor.cpp index 8ae0b310..c1657491 100644 --- a/source/backend/hiai/execution/NPUConvertTensor.cpp +++ b/source/backend/hiai/execution/NPUConvertTensor.cpp @@ -20,25 +20,31 @@ ErrorCode NPUConvertTensor::onResize(const std::vector 
&inputs, const mNpuBackend->setNetworkInput(inputs, mOp); auto opName = mOp->name()->str(); - - shared_ptr convertTensor(new ge::op::Reshape(opName)); - - vector shapeDims = {outputs[0]->batch(), outputs[0]->channel(), outputs[0]->height(), outputs[0]->width()}; - auto xOp = mNpuBackend->getInputOps(mOp); - int index = mOp->inputIndexes()->data()[0]; - auto iter = mNpuBackend->mSclipMap.find(index); - if(iter != mNpuBackend->mSclipMap.end()){ - (*convertTensor).SetInput(0, *xOp, mNpuBackend->mSclipMap[index]); - (*convertTensor).set_attr_shape( - ge::AttrValue::LIST_INT(shapeDims)); - }else{ - (*convertTensor).set_input_tensor(*xOp).set_attr_shape( - ge::AttrValue::LIST_INT(shapeDims)); - } - - mNpuBackend->setOutputOps(mOp, {convertTensor}); + if (outputs[0]->buffer().dimensions==2) { //These conditions require special processing dimensions, not simple reshape, but equivalent transposes + shared_ptr permute1(new ge::op::Permute(opName)); + (*permute1) + .set_input_x(*xOp.get()) + .set_attr_order(ge::AttrValue::LIST_INT({2,1,0,3})); + mNpuBackend->setOutputOps(mOp, {permute1}, outputs); + } else { + shared_ptr convertTensor(new ge::op::Reshape(opName)); + + vector shapeDims = {outputs[0]->batch(), outputs[0]->channel(), outputs[0]->height(), outputs[0]->width()}; + + int index = mOp->inputIndexes()->data()[0]; + auto iter = mNpuBackend->mSclipMap.find(index); + if(iter != mNpuBackend->mSclipMap.end()){ + (*convertTensor).SetInput(0, *xOp, mNpuBackend->mSclipMap[index]); + (*convertTensor).set_attr_shape( + ge::AttrValue::LIST_INT(shapeDims)); + }else{ + (*convertTensor).set_input_tensor(*xOp).set_attr_shape( + ge::AttrValue::LIST_INT(shapeDims)); + } + mNpuBackend->setOutputOps(mOp, {convertTensor}, outputs); + } return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUConvolution.cpp b/source/backend/hiai/execution/NPUConvolution.cpp index 27d34aff..0c8523d0 100644 --- a/source/backend/hiai/execution/NPUConvolution.cpp +++ b/source/backend/hiai/execution/NPUConvolution.cpp @@ -115,9 +115,9 @@ ErrorCode NPUConvolution::onResize(const std::vector &inputs, const st } if (relu || relu6) { - mNpuBackend->setOutputOps(mOp, {conv, mRelu_conv}); + mNpuBackend->setOutputOps(mOp, {conv, mRelu_conv}, outputs); }else{ - mNpuBackend->setOutputOps(mOp, {conv}); + mNpuBackend->setOutputOps(mOp, {conv}, outputs); } return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUConvolutionDepthwise.cpp b/source/backend/hiai/execution/NPUConvolutionDepthwise.cpp index eff802ee..47f0bbda 100644 --- a/source/backend/hiai/execution/NPUConvolutionDepthwise.cpp +++ b/source/backend/hiai/execution/NPUConvolutionDepthwise.cpp @@ -110,9 +110,9 @@ ErrorCode NPUConvolutionDepthwise::onResize(const std::vector &inputs, } if (relu || relu6) { - mNpuBackend->setOutputOps(mOp, {conv, mRelu_conv}); + mNpuBackend->setOutputOps(mOp, {conv, mRelu_conv}, outputs); }else{ - mNpuBackend->setOutputOps(mOp, {conv}); + mNpuBackend->setOutputOps(mOp, {conv}, outputs); } return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUConvolutionDepthwiseInt8.cpp b/source/backend/hiai/execution/NPUConvolutionDepthwiseInt8.cpp index 79981032..801b5b9e 100644 --- a/source/backend/hiai/execution/NPUConvolutionDepthwiseInt8.cpp +++ b/source/backend/hiai/execution/NPUConvolutionDepthwiseInt8.cpp @@ -105,9 +105,9 @@ ErrorCode NPUConvolutionDepthwiseInt8::onResize(const std::vector &inp } if (relu || relu6) { - mNpuBackend->setOutputOps(mOp, {conv, mRelu_conv}); + mNpuBackend->setOutputOps(mOp, {conv, mRelu_conv}, outputs); 
}else{ - mNpuBackend->setOutputOps(mOp, {conv}); + mNpuBackend->setOutputOps(mOp, {conv}, outputs); } return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUConvolutionInt8.cpp b/source/backend/hiai/execution/NPUConvolutionInt8.cpp index 0a47e3c4..96ce6d1a 100644 --- a/source/backend/hiai/execution/NPUConvolutionInt8.cpp +++ b/source/backend/hiai/execution/NPUConvolutionInt8.cpp @@ -110,9 +110,9 @@ ErrorCode NPUConvolutionInt8::onResize(const std::vector &inputs, cons } if (relu || relu6) { - mNpuBackend->setOutputOps(mOp, {conv, mRelu_conv}); + mNpuBackend->setOutputOps(mOp, {conv, mRelu_conv}, outputs); }else{ - mNpuBackend->setOutputOps(mOp, {conv}); + mNpuBackend->setOutputOps(mOp, {conv}, outputs); } }else{ vector filter_scale(int32ToInt8Scale, int32ToInt8Scale + quantizedParams->scale()->size()); @@ -169,9 +169,9 @@ ErrorCode NPUConvolutionInt8::onResize(const std::vector &inputs, cons } if (relu || relu6) { - mNpuBackend->setOutputOps(mOp, {conv, mRelu_conv}); + mNpuBackend->setOutputOps(mOp, {conv, mRelu_conv}, outputs); }else{ - mNpuBackend->setOutputOps(mOp, {conv}); + mNpuBackend->setOutputOps(mOp, {conv}, outputs); } } return NO_ERROR; diff --git a/source/backend/hiai/execution/NPUDeconvolution.cpp b/source/backend/hiai/execution/NPUDeconvolution.cpp index 2a85c960..204829eb 100644 --- a/source/backend/hiai/execution/NPUDeconvolution.cpp +++ b/source/backend/hiai/execution/NPUDeconvolution.cpp @@ -98,9 +98,9 @@ ErrorCode NPUDeconvolution::onResize(const std::vector &inputs, const } if (relu || relu6) { - mNpuBackend->setOutputOps(mOp, {deconv, mRelu_conv}); + mNpuBackend->setOutputOps(mOp, {deconv, mRelu_conv}, outputs); }else{ - mNpuBackend->setOutputOps(mOp, {deconv}); + mNpuBackend->setOutputOps(mOp, {deconv}, outputs); } return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUDepthToSpace.cpp b/source/backend/hiai/execution/NPUDepthToSpace.cpp index d8dcf57c..3c0cff62 100644 --- a/source/backend/hiai/execution/NPUDepthToSpace.cpp +++ b/source/backend/hiai/execution/NPUDepthToSpace.cpp @@ -51,7 +51,7 @@ ErrorCode NPUDepthToSpace::onResize(const std::vector &inputs, const s .set_attr_order({0,3,1,2}) .SetAttr("NHWC_to_NCHW", ge::AttrValue::CreateFrom(1)); - mNpuBackend->setOutputOps(mOp, {permuteBefore, depthToSpace, permuteAfter}); + mNpuBackend->setOutputOps(mOp, {permuteBefore, depthToSpace, permuteAfter}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUEltwise.cpp b/source/backend/hiai/execution/NPUEltwise.cpp index c4545c6c..f6194e1d 100644 --- a/source/backend/hiai/execution/NPUEltwise.cpp +++ b/source/backend/hiai/execution/NPUEltwise.cpp @@ -44,7 +44,7 @@ ErrorCode NPUEltwise::onResize(const std::vector &inputs, const std::v (*sub) .set_input_x1(*xOp1.get()) .set_input_x2(*xOp2.get()); - mNpuBackend->setOutputOps(mOp, {sub}); + mNpuBackend->setOutputOps(mOp, {sub}, outputs); } else { (*eltwise) .set_input_x1(*xOp1.get()) @@ -53,7 +53,7 @@ ErrorCode NPUEltwise::onResize(const std::vector &inputs, const std::v .set_attr_weight(ge::AttrValue::LIST_TENSOR{}) .set_attr_mode(param->type()); // 0:product,1:sum,2:max;default is CC_ELTWISE_SUM. 
TODO SUB Weight - mNpuBackend->setOutputOps(mOp, {eltwise}); + mNpuBackend->setOutputOps(mOp, {eltwise}, outputs); } return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUEltwiseInt8.cpp b/source/backend/hiai/execution/NPUEltwiseInt8.cpp index f7ef89f7..22d1cbad 100644 --- a/source/backend/hiai/execution/NPUEltwiseInt8.cpp +++ b/source/backend/hiai/execution/NPUEltwiseInt8.cpp @@ -157,7 +157,7 @@ ErrorCode NPUEltwiseInt8::onResize(const std::vector &inputs, const st .set_input_clip_value_min(mConstMin) .set_input_clip_value_max(mConstMax); - mNpuBackend->setOutputOps(mOp, {scale0, scale1, clip0, clip1, eltwise, clip}); + mNpuBackend->setOutputOps(mOp, {scale0, scale1, clip0, clip1, eltwise, clip}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUExpandDims.cpp b/source/backend/hiai/execution/NPUExpandDims.cpp index c48e188f..c43a2fd5 100644 --- a/source/backend/hiai/execution/NPUExpandDims.cpp +++ b/source/backend/hiai/execution/NPUExpandDims.cpp @@ -30,7 +30,7 @@ ErrorCode NPUExpandDims::onResize(const std::vector &inputs, const std (*prob).set_input_tensor(*xOp.get()).set_attr_shape(ge::AttrValue::LIST_INT(shape)); - mNpuBackend->setOutputOps(mOp, {prob}); + mNpuBackend->setOutputOps(mOp, {prob}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUFloatToInt8.cpp b/source/backend/hiai/execution/NPUFloatToInt8.cpp index 149a9c1c..d11d7a28 100644 --- a/source/backend/hiai/execution/NPUFloatToInt8.cpp +++ b/source/backend/hiai/execution/NPUFloatToInt8.cpp @@ -68,7 +68,7 @@ ErrorCode NPUFloatToInt8::onResize(const std::vector &inputs, const st .set_input_clip_value_min(mConstMin) .set_input_clip_value_max(mConstMax); - mNpuBackend->setOutputOps(mOp, {floatToInt8, clip}); + mNpuBackend->setOutputOps(mOp, {floatToInt8, clip}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUGatherV2.cpp b/source/backend/hiai/execution/NPUGatherV2.cpp index c26051f1..f28f859e 100644 --- a/source/backend/hiai/execution/NPUGatherV2.cpp +++ b/source/backend/hiai/execution/NPUGatherV2.cpp @@ -104,7 +104,7 @@ ErrorCode NPUGatherV2::onResize(const std::vector &inputs, const std:: .set_attr_axis(axis); } - mNpuBackend->setOutputOps(mOp, {prob}); + mNpuBackend->setOutputOps(mOp, {prob}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUInstanceNorm.cpp b/source/backend/hiai/execution/NPUInstanceNorm.cpp index 0e4b029d..b465db88 100644 --- a/source/backend/hiai/execution/NPUInstanceNorm.cpp +++ b/source/backend/hiai/execution/NPUInstanceNorm.cpp @@ -45,7 +45,7 @@ ErrorCode NPUInstanceNorm::onResize(const std::vector &inputs, const s .set_input_gamma(mScale) .set_input_beta(mBias); - mNpuBackend->setOutputOps(mOp, {insNorm}); + mNpuBackend->setOutputOps(mOp, {insNorm}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUInt8ToFloat.cpp b/source/backend/hiai/execution/NPUInt8ToFloat.cpp index 9857cece..070dc336 100644 --- a/source/backend/hiai/execution/NPUInt8ToFloat.cpp +++ b/source/backend/hiai/execution/NPUInt8ToFloat.cpp @@ -72,7 +72,7 @@ ErrorCode NPUInt8ToFloat::onResize(const std::vector &inputs, const st .set_input_x(*clip) .set_input_filter(mConst_fliter); - mNpuBackend->setOutputOps(mOp, {clip, int8ToFloat}); + mNpuBackend->setOutputOps(mOp, {clip, int8ToFloat}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUInterp.cpp b/source/backend/hiai/execution/NPUInterp.cpp index 54a14f61..503715ea 100644 --- a/source/backend/hiai/execution/NPUInterp.cpp +++ 
b/source/backend/hiai/execution/NPUInterp.cpp @@ -39,25 +39,19 @@ ErrorCode NPUInterp::onResize(const std::vector &inputs, const std::ve (*interp).set_input_x(*xOp) .set_input_size(mConstShape) .set_attr_align_corners(param->alignCorners()); - mNpuBackend->setOutputOps(mOp, {interp}); + mNpuBackend->setOutputOps(mOp, {interp}, outputs); } else if(resizeType == 2) { - shared_ptr interp(new hiai::op::ResizeBilinearV2(opName)); + shared_ptr interp(new hiai::op::ResizeBilinear(opName)); (*interp).set_input_x(*xOp) .set_input_size(mConstShape) - .set_attr_align_corners(param->alignCorners()) - .set_attr_half_pixel_centers(param->halfPixelCenters() || - param->ctm() == CoordinateTransformationMode_PytorchHalfPixels || - param->ctm() == CoordinateTransformationMode_TensorflowHalfPixels); - mNpuBackend->setOutputOps(mOp, {interp}); + .set_attr_align_corners(param->alignCorners()); + mNpuBackend->setOutputOps(mOp, {interp}, outputs); } else if(resizeType == 3) { - shared_ptr interp(new hiai::op::ResizeBilinearV2(opName)); + shared_ptr interp(new hiai::op::ResizeBilinear(opName)); (*interp).set_input_x(*xOp) .set_input_size(mConstShape) - .set_attr_align_corners(param->alignCorners()) - .set_attr_half_pixel_centers(param->halfPixelCenters() || - param->ctm() == CoordinateTransformationMode_PytorchHalfPixels || - param->ctm() == CoordinateTransformationMode_TensorflowHalfPixels); - mNpuBackend->setOutputOps(mOp, {interp}); + .set_attr_align_corners(param->alignCorners()); + mNpuBackend->setOutputOps(mOp, {interp}, outputs); } return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUMatmul.cpp b/source/backend/hiai/execution/NPUMatmul.cpp index 04022c68..efaac529 100644 --- a/source/backend/hiai/execution/NPUMatmul.cpp +++ b/source/backend/hiai/execution/NPUMatmul.cpp @@ -64,7 +64,7 @@ ErrorCode NPUMatmul::onResize(const std::vector &inputs, const std::ve auto shape = tensorShapeFormat(outputs[0]); (*reshape3).set_input_tensor(*matmul).set_attr_shape(ge::AttrValue::LIST_INT(shape)); - mNpuBackend->setOutputOps(mOp, {reshape, matmul, reshape3}); + mNpuBackend->setOutputOps(mOp, {reshape, matmul, reshape3}, outputs); }else{ //hangxing todo @@ -87,7 +87,7 @@ ErrorCode NPUMatmul::onResize(const std::vector &inputs, const std::ve shared_ptr reshape3(new ge::op::Reshape(opName + "_reshape3")); (*reshape3).set_input_tensor(*permute).set_attr_shape(ge::AttrValue::LIST_INT({1, outputs[0]->buffer().dim[1].extent, outputs[0]->buffer().dim[0].extent, 1})); - mNpuBackend->setOutputOps(mOp, {reshape, reshape2, matmul, permute, reshape3}); + mNpuBackend->setOutputOps(mOp, {reshape, reshape2, matmul, permute, reshape3}, outputs); } return NO_ERROR; diff --git a/source/backend/hiai/execution/NPUPack.cpp b/source/backend/hiai/execution/NPUPack.cpp index 62fb4725..ffbe0243 100644 --- a/source/backend/hiai/execution/NPUPack.cpp +++ b/source/backend/hiai/execution/NPUPack.cpp @@ -28,7 +28,7 @@ ErrorCode NPUPack::onResize(const std::vector &inputs, const std::vect .set_dynamic_input_values(0, *xOp.get()) .set_attr_axis(axisFormat(inputs[0], param->axis())); - mNpuBackend->setOutputOps(mOp, {pack}); + mNpuBackend->setOutputOps(mOp, {pack}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUPadding.cpp b/source/backend/hiai/execution/NPUPadding.cpp index af119ced..cd056b9a 100644 --- a/source/backend/hiai/execution/NPUPadding.cpp +++ b/source/backend/hiai/execution/NPUPadding.cpp @@ -47,7 +47,7 @@ ErrorCode NPUPadding::onResize(const std::vector &inputs, const std::v 
(*padding).set_input_x(*xOp.get()).set_input_paddings(mConst); - mNpuBackend->setOutputOps(mOp, {padding}); + mNpuBackend->setOutputOps(mOp, {padding}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUPooling.cpp b/source/backend/hiai/execution/NPUPooling.cpp index bf0b7007..54382116 100644 --- a/source/backend/hiai/execution/NPUPooling.cpp +++ b/source/backend/hiai/execution/NPUPooling.cpp @@ -76,7 +76,7 @@ ErrorCode NPUPooling::onResize(const std::vector &inputs, const std::v .set_attr_window(ge::AttrValue::LIST_INT({kernelH/2, kernelW/2})) .set_attr_stride(ge::AttrValue::LIST_INT({strideHeight, strideWidth})) .set_attr_global_pooling(poolParam->isGlobal()); - mNpuBackend->setOutputOps(mOp, {pooling2X2,pooling}); + mNpuBackend->setOutputOps(mOp, {pooling2X2,pooling}, outputs); } else { (*pooling) .set_input_x(*xOp.get()) @@ -90,7 +90,7 @@ ErrorCode NPUPooling::onResize(const std::vector &inputs, const std::v .set_attr_stride(ge::AttrValue::LIST_INT({strideHeight, strideWidth})) .set_attr_global_pooling(poolParam->isGlobal()); - mNpuBackend->setOutputOps(mOp, {pooling}); + mNpuBackend->setOutputOps(mOp, {pooling}, outputs); } return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUReduction.cpp b/source/backend/hiai/execution/NPUReduction.cpp index ce7bf671..846ad9d5 100644 --- a/source/backend/hiai/execution/NPUReduction.cpp +++ b/source/backend/hiai/execution/NPUReduction.cpp @@ -69,13 +69,13 @@ ErrorCode NPUReduction::onResize(const std::vector &inputs, const std: (*reduction) .set_input_x(*xOp.get()).set_input_axes(mConstAxis) .set_attr_keep_dims(mOp->main_as_ReductionParam()->keepDims()); - mNpuBackend->setOutputOps(mOp, {reduction}); + mNpuBackend->setOutputOps(mOp, {reduction}, outputs); }else if(type == ReductionType_SUM) { shared_ptr reduction(new hiai::op::ReduceSum(opName)); (*reduction) .set_input_x(*xOp.get()).set_input_axes(mConstAxis) .set_attr_keep_dims(mOp->main_as_ReductionParam()->keepDims()); - mNpuBackend->setOutputOps(mOp, {reduction}); + mNpuBackend->setOutputOps(mOp, {reduction}, outputs); }else if(type == ReductionType_MEAN) { shared_ptr reduction(new hiai::op::ReduceMean(opName)); (*reduction) @@ -85,16 +85,16 @@ ErrorCode NPUReduction::onResize(const std::vector &inputs, const std: auto shapeDims = tensorShapeFormat(outputs[0]); shared_ptr reshape1(new ge::op::Reshape(opName+"reshape1")); (*reshape1).set_input_tensor(*reduction.get()).set_attr_shape(ge::AttrValue::LIST_INT(shapeDims)); - mNpuBackend->setOutputOps(mOp, {reduction,reshape1}); + mNpuBackend->setOutputOps(mOp, {reduction,reshape1}, outputs); } else { - mNpuBackend->setOutputOps(mOp, {reduction}); + mNpuBackend->setOutputOps(mOp, {reduction}, outputs); } } else if(type == ReductionType_ANY) { shared_ptr reduction(new ge::op::ReduceAll(opName)); (*reduction) .set_input_x(*xOp.get()).set_attr_axes(axis) .set_attr_keep_dims(mOp->main_as_ReductionParam()->keepDims()); - mNpuBackend->setOutputOps(mOp, {reduction}); + mNpuBackend->setOutputOps(mOp, {reduction}, outputs); }else{ MNN_ERROR("npu reducton not support type : %d \n", type); } diff --git a/source/backend/hiai/execution/NPUReshape.cpp b/source/backend/hiai/execution/NPUReshape.cpp index 671edfa0..a33dbee2 100644 --- a/source/backend/hiai/execution/NPUReshape.cpp +++ b/source/backend/hiai/execution/NPUReshape.cpp @@ -51,7 +51,7 @@ ErrorCode NPUReshape::onResize(const std::vector &inputs, const std::v if ((TensorUtils::getDescribe(input)->dimensionFormat != MNN::MNN_DATA_FORMAT_NHWC) || (isSameDims(input, 
outputs[0]) || (inputDims == shapeDims))) { (*reshape).set_input_tensor(*xOp).set_attr_shape(ge::AttrValue::LIST_INT(shapeDims)); - mNpuBackend->setOutputOps(mOp, {reshape}); + mNpuBackend->setOutputOps(mOp, {reshape}, outputs); } else { shared_ptr permute1(new ge::op::Permute(opName+"_perm1")); shared_ptr permute2(new ge::op::Permute(opName+"_perm2")); @@ -65,7 +65,7 @@ ErrorCode NPUReshape::onResize(const std::vector &inputs, const std::v (*permute2) .set_input_x(*reshape.get()) .set_attr_order(ge::AttrValue::LIST_INT({0,3,1,2})); - mNpuBackend->setOutputOps(mOp, {permute1,reshape,permute2}); + mNpuBackend->setOutputOps(mOp, {permute1,reshape,permute2}, outputs); } return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUScale.cpp b/source/backend/hiai/execution/NPUScale.cpp index 5719a684..6a1ff637 100644 --- a/source/backend/hiai/execution/NPUScale.cpp +++ b/source/backend/hiai/execution/NPUScale.cpp @@ -51,7 +51,7 @@ ErrorCode NPUScale::onResize(const std::vector &inputs, const std::vec (*scale).set_input_x(*xOp.get()).set_input_filter(mConst_fliter).set_input_bias(mConst_bias).set_attr_has_bias_value(true); - mNpuBackend->setOutputOps(mOp, {scale}); + mNpuBackend->setOutputOps(mOp, {scale}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUSlice.cpp b/source/backend/hiai/execution/NPUSlice.cpp index 2fd2a9fa..0a846bc7 100644 --- a/source/backend/hiai/execution/NPUSlice.cpp +++ b/source/backend/hiai/execution/NPUSlice.cpp @@ -41,7 +41,7 @@ ErrorCode NPUSlice::onResize(const std::vector &inputs, const std::vec .set_attr_num_split(outputs.size()) .create_dynamic_output_y(outputs.size()); - mNpuBackend->setOutputOps(mOp, {slice}); + mNpuBackend->setOutputOps(mOp, {slice}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUSliceTf.cpp b/source/backend/hiai/execution/NPUSliceTf.cpp index d9c3ac22..873137b8 100644 --- a/source/backend/hiai/execution/NPUSliceTf.cpp +++ b/source/backend/hiai/execution/NPUSliceTf.cpp @@ -57,7 +57,7 @@ ErrorCode NPUSliceTf::onResize(const std::vector &inputs, const std::v (*slice).set_input_input(*xOp) .set_input_offsets(mConst_start) .set_input_size(mConst_size); - mNpuBackend->setOutputOps(mOp, {slice}); + mNpuBackend->setOutputOps(mOp, {slice}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUSoftmax.cpp b/source/backend/hiai/execution/NPUSoftmax.cpp index ed9c0893..4d728173 100644 --- a/source/backend/hiai/execution/NPUSoftmax.cpp +++ b/source/backend/hiai/execution/NPUSoftmax.cpp @@ -59,14 +59,14 @@ ErrorCode NPUSoftmax::onResize(const std::vector &inputs, const std::v shared_ptr mul(new ge::op::Mul(opName + "_mul")); (*mul).set_input_x(*exp.get()).set_input_y(*rec.get()); - mNpuBackend->setOutputOps(mOp, {sub, exp, sum, rec, mul}); + mNpuBackend->setOutputOps(mOp, {sub, exp, sum, rec, mul}, outputs); }else{ shared_ptr softmax(new ge::op::Softmax(opName)); (*softmax).set_input_x(*xOp.get()).set_attr_axis(axisFormat(inputs[0], param->axis())).set_attr_algo(1); - mNpuBackend->setOutputOps(mOp, {softmax}); + mNpuBackend->setOutputOps(mOp, {softmax}, outputs); } return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUSqueeze.cpp b/source/backend/hiai/execution/NPUSqueeze.cpp index bd7b49db..18021ac1 100644 --- a/source/backend/hiai/execution/NPUSqueeze.cpp +++ b/source/backend/hiai/execution/NPUSqueeze.cpp @@ -29,7 +29,7 @@ ErrorCode NPUSqueeze::onResize(const std::vector &inputs, const std::v 
(*prob).set_input_tensor(*xOp.get()).set_attr_shape(ge::AttrValue::LIST_INT(shape)); - mNpuBackend->setOutputOps(mOp, {prob}); + mNpuBackend->setOutputOps(mOp, {prob}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUStridedSlice.cpp b/source/backend/hiai/execution/NPUStridedSlice.cpp index f7e7a1d6..26fc11fc 100644 --- a/source/backend/hiai/execution/NPUStridedSlice.cpp +++ b/source/backend/hiai/execution/NPUStridedSlice.cpp @@ -91,7 +91,7 @@ ErrorCode NPUStridedSlice::onResize(const std::vector &inputs, const s .set_attr_new_axis_mask(newAxisMask) .set_attr_shrink_axis_mask(shrinkAxisMask); - mNpuBackend->setOutputOps(mOp, {stride_slice}); + mNpuBackend->setOutputOps(mOp, {stride_slice}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUTopKV2.cpp b/source/backend/hiai/execution/NPUTopKV2.cpp index 18661508..a22bf871 100644 --- a/source/backend/hiai/execution/NPUTopKV2.cpp +++ b/source/backend/hiai/execution/NPUTopKV2.cpp @@ -42,7 +42,7 @@ ErrorCode NPUTopKV2::onResize(const std::vector &inputs, const std::ve .set_input_x(*xOp.get()) .set_input_k(mConst_w); - mNpuBackend->setOutputOps(mOp, {prob}); + mNpuBackend->setOutputOps(mOp, {prob}, outputs); return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUTranspose.cpp b/source/backend/hiai/execution/NPUTranspose.cpp index ee60aaa5..dd1ca272 100644 --- a/source/backend/hiai/execution/NPUTranspose.cpp +++ b/source/backend/hiai/execution/NPUTranspose.cpp @@ -59,13 +59,13 @@ ErrorCode NPUTranspose::onResize(const std::vector &inputs, const std: if(isPermNoChange(permutation)) { shared_ptr reshape(new ge::op::Reshape(opName)); (*reshape).set_input_tensor(*xOp).set_attr_shape(ge::AttrValue::LIST_INT(shapeDims)); - mNpuBackend->setOutputOps(mOp, {reshape}); + mNpuBackend->setOutputOps(mOp, {reshape}, outputs); } else { shared_ptr permute(new ge::op::Permute(opName)); (*permute) .set_input_x(*xOp.get()) .set_attr_order(permutation); - mNpuBackend->setOutputOps(mOp, {permute}); + mNpuBackend->setOutputOps(mOp, {permute}, outputs); } return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUUnary.cpp b/source/backend/hiai/execution/NPUUnary.cpp index 4c92c3e1..d9527c51 100644 --- a/source/backend/hiai/execution/NPUUnary.cpp +++ b/source/backend/hiai/execution/NPUUnary.cpp @@ -28,20 +28,20 @@ ErrorCode NPUUnary::onResize(const std::vector &inputs, const std::vec if(unary_type == UnaryOpOperation_EXP){ shared_ptr unary(new ge::op::Exp(opName)); (*unary).set_input_x(*xOp.get()); - mNpuBackend->setOutputOps(mOp, {unary}); + mNpuBackend->setOutputOps(mOp, {unary}, outputs); }else if(unary_type == UnaryOpOperation_NEG){ shared_ptr unary(new ge::op::Neg(opName)); (*unary).set_input_x(*xOp.get()); - mNpuBackend->setOutputOps(mOp, {unary}); + mNpuBackend->setOutputOps(mOp, {unary}, outputs); }else if(unary_type == UnaryOpOperation_ABS){ shared_ptr unary(new ge::op::Activation(opName)); (*unary).set_input_x(*xOp.get()) .set_attr_mode(6); - mNpuBackend->setOutputOps(mOp, {unary}); + mNpuBackend->setOutputOps(mOp, {unary}, outputs); }else if(unary_type == UnaryOpOperation_SQRT){ shared_ptr unary(new ge::op::Sqrt(opName)); (*unary).set_input_x(*xOp.get()); - mNpuBackend->setOutputOps(mOp, {unary}); + mNpuBackend->setOutputOps(mOp, {unary}, outputs); }else{ MNN_ERROR("unary not support this case : %d \n", unary_type); } diff --git a/source/backend/metal/MetalBackend.hpp b/source/backend/metal/MetalBackend.hpp index 6bfa8923..32f38335 100644 --- a/source/backend/metal/MetalBackend.hpp +++ 
b/source/backend/metal/MetalBackend.hpp @@ -37,7 +37,7 @@ public: MetalRuntime(); virtual ~ MetalRuntime(); - virtual Backend* onCreate() const override; + virtual Backend* onCreate(const BackendConfig* config) const override; virtual void onGabageCollect(int level) override; void *context() const { return mContext; diff --git a/source/backend/metal/MetalBackend.mm b/source/backend/metal/MetalBackend.mm index c4b1231a..8e9776b0 100644 --- a/source/backend/metal/MetalBackend.mm +++ b/source/backend/metal/MetalBackend.mm @@ -168,12 +168,19 @@ Execution *MetalBackend::onCreate(const std::vector &inputs, const std const Op *op) { auto map = getCreatorMap(); auto iter = map->find(op->type()); + if (iter == map->end()) { + if (nullptr != op->name()) { + MNN_PRINT("Don't support type [%s], %s\n", EnumNameOpType(op->type()), op->name()->c_str()); + } else { + MNN_PRINT("Don't support type [%s]\n", EnumNameOpType(op->type())); + } return NULL; } + auto exe = iter->second->onCreate(inputs, op, this); if (NULL == exe) { - MNN_PRINT("The Creator Don't support type %d, %s\n", op->type(), op->name() ? op->name()->c_str() : ""); + MNN_PRINT("The Creator Don't support type [%s], %s\n", MNN::EnumNameOpType(op->type()), op->name() ? op->name()->c_str() : ""); return NULL; } return exe; @@ -512,7 +519,7 @@ MetalRuntime::MetalRuntime() { MetalRuntime::~ MetalRuntime() { CFRelease(mContext); } -Backend* MetalRuntime::onCreate() const { +Backend* MetalRuntime::onCreate(const BackendConfig* config) const { return new MetalBackend(this); } void MetalRuntime::onGabageCollect(int level) { diff --git a/source/backend/metal/MetalConvolution.metal b/source/backend/metal/MetalConvolution.metal index 2097fd8e..eb9a61e8 100644 --- a/source/backend/metal/MetalConvolution.metal +++ b/source/backend/metal/MetalConvolution.metal @@ -60,10 +60,10 @@ kernel void conv(const device ftype4 *in [[buffer(0)]], int offset_y = (int)gid.y * cst.stride_y - cst.pad_y; int sx = max(0, (UP_DIV(-offset_x, cst.dilation_x))); int ex = min(cst.kernel_x, UP_DIV(cst.input_width - offset_x, cst.dilation_x)); - short kw = ex - sx; + int kw = ex - sx; int sy = max(0, (UP_DIV(-offset_y, cst.dilation_y))); int ey = min(cst.kernel_y, UP_DIV(cst.input_height - offset_y, cst.dilation_y)); - short kh = ey - sy; + int kh = ey - sy; offset_x += sx * cst.dilation_x; offset_y += sy * cst.dilation_y; @@ -72,7 +72,7 @@ kernel void conv(const device ftype4 *in [[buffer(0)]], auto z_out = out + (int)gid.z * cst.output_size + (int)gid.y * cst.output_width + (int)gid.x; int dilation_h = cst.input_width * cst.dilation_y; - float4 result = float4(biasTerms[(short)gid.z]); + float4 result = float4(biasTerms[(int)gid.z]); for (auto z = 0; z < cst.input_slice; z++) { for (auto y = 0; y < kh; y++) { for (auto x = 0; x < kw; x++) { @@ -100,10 +100,10 @@ kernel void conv_z4(const device ftype4 *in [[buffer(0)]], int offset_y = (int)gid.y * cst.stride_y - cst.pad_y; int sx = max(0, (UP_DIV(-offset_x, cst.dilation_x))); int ex = min(cst.kernel_x, UP_DIV(cst.input_width - offset_x, cst.dilation_x)); - short kw = ex - sx; + int kw = ex - sx; int sy = max(0, (UP_DIV(-offset_y, cst.dilation_y))); int ey = min(cst.kernel_y, UP_DIV(cst.input_height - offset_y, cst.dilation_y)); - short kh = ey - sy; + int kh = ey - sy; offset_x += sx * cst.dilation_x; offset_y += sy * cst.dilation_y; @@ -138,18 +138,18 @@ kernel void conv_local(const device ftype4 *in [[buffer(0)]], const device ftype4x4 *wt [[buffer(3)]], const device ftype4 *biasTerms [[buffer(4)]], threadgroup ftype4x4 *cols 
[[threadgroup(0)]], - ushort3 gid [[thread_position_in_grid]], - ushort3 tid [[thread_position_in_threadgroup]], - ushort3 thread_size [[threads_per_threadgroup]]) { - short unroll_x = CONV_UNROLL * gid.x; - short offset_x = unroll_x * cst.stride_x - cst.pad_x; - short offset_y = gid.y * cst.stride_y - cst.pad_y; - short sy = max(0, UP_DIV(-offset_y, cst.dilation_y)); - short ey = min(cst.kernel_y, UP_DIV(cst.input_height - offset_y, cst.dilation_y)); + uint3 gid [[thread_position_in_grid]], + uint3 tid [[thread_position_in_threadgroup]], + uint3 thread_size [[threads_per_threadgroup]]) { + int unroll_x = CONV_UNROLL * gid.x; + int offset_x = unroll_x * cst.stride_x - cst.pad_x; + int offset_y = gid.y * cst.stride_y - cst.pad_y; + int sy = max(0, UP_DIV(-offset_y, cst.dilation_y)); + int ey = min(cst.kernel_y, UP_DIV(cst.input_height - offset_y, cst.dilation_y)); auto o_wt = wt + (int)gid.z * cst.input_slice * cst.kernel_size; float4x4 result = float4x4(0); - short steps = UP_DIV(cst.input_slice, cst.threadgroup_input_slice); + int steps = UP_DIV(cst.input_slice, cst.threadgroup_input_slice); for (auto s = 0; s < steps; s++) { int sz_stt = s * cst.threadgroup_input_slice; @@ -181,7 +181,7 @@ kernel void conv_local(const device ftype4 *in [[buffer(0)]], threadgroup_barrier(mem_flags::mem_threadgroup); // gemm - if ((short)gid.z < cst.output_slice) { + if ((int)gid.z < cst.output_slice) { for (auto z = 0; z < sz_size; z++) { for (auto ky = sy; ky < ey; ky++) { for (auto kx = 0; kx < cst.kernel_x; kx++) { @@ -203,9 +203,9 @@ kernel void conv_local(const device ftype4 *in [[buffer(0)]], } // end step // save - if ((short)gid.z >= cst.output_slice) return; + if ((int)gid.z >= cst.output_slice) return; - float4 b4 = float4(biasTerms[(short)gid.z]); + float4 b4 = float4(biasTerms[(int)gid.z]); auto off_out = out + (int)gid.z * cst.output_size + (int)gid.y * cst.output_width + unroll_x; bool3 valids = (unroll_x + int3(1, 2, 3)) < cst.output_width; /* true */ off_out[0] = activate((ftype4)(result[0] + b4), cst.activation); diff --git a/source/backend/metal/MetalConvolution1x1.metal b/source/backend/metal/MetalConvolution1x1.metal index 3332aec5..5c7a28fd 100644 --- a/source/backend/metal/MetalConvolution1x1.metal +++ b/source/backend/metal/MetalConvolution1x1.metal @@ -38,7 +38,7 @@ kernel void conv1x1(const device ftype4 *in [[buffer(0)]], auto xy_in = in + (int)gid.z * cst.input_slice * cst.input_size + g * cst.input_size + (int)gid.x; auto xy_out = out + (int)gid.z * cst.output_slice * cst.output_size + (int)gid.y * cst.output_size + (int)gid.x; - float4 result = float4(biasTerms[(short)gid.y]); + float4 result = float4(biasTerms[gid.y]); for (auto z = 0; z < cst.input_group_slice; z++, xy_in += cst.input_size) { result += float4(*xy_in * xy_wt[z]); } diff --git a/source/backend/metal/MetalConvolutionDepthwise.metal b/source/backend/metal/MetalConvolutionDepthwise.metal index 21d7f708..f3e17d12 100644 --- a/source/backend/metal/MetalConvolutionDepthwise.metal +++ b/source/backend/metal/MetalConvolutionDepthwise.metal @@ -39,23 +39,23 @@ kernel void conv_depthwise(const device ftype4 *in [[buffer(0)]], constant conv_dw_cst& cst [[buffer(2)]], const device ftype4 *wt [[buffer(3)]], const device ftype4 *biasTerms [[buffer(4)]], - ushort3 gid [[thread_position_in_grid]]) { + uint3 gid [[thread_position_in_grid]]) { if ((int)gid.x >= cst.output_width || (int)gid.y >= cst.output_height || (int)gid.z >= cst.slice * cst.batch) return; - short oz = gid.z % cst.slice; - short offset_x = (int)gid.x * 
cst.stride_x - cst.pad_x; - short offset_y = (int)gid.y * cst.stride_y - cst.pad_y; - short sx = max(0, (UP_DIV(-offset_x, cst.dilation_x))); - short ex = min(cst.kernel_x, UP_DIV(cst.input_width - offset_x, cst.dilation_x)); - short sy = max(0, (UP_DIV(-offset_y, cst.dilation_y))); - short ey = min(cst.kernel_y, UP_DIV(cst.input_height - offset_y, cst.dilation_y)); + int oz = gid.z % cst.slice; + int offset_x = (int)gid.x * cst.stride_x - cst.pad_x; + int offset_y = (int)gid.y * cst.stride_y - cst.pad_y; + int sx = max(0, (UP_DIV(-offset_x, cst.dilation_x))); + int ex = min(cst.kernel_x, UP_DIV(cst.input_width - offset_x, cst.dilation_x)); + int sy = max(0, (UP_DIV(-offset_y, cst.dilation_y))); + int ey = min(cst.kernel_y, UP_DIV(cst.input_height - offset_y, cst.dilation_y)); offset_x += sx * cst.dilation_x; offset_y += sy * cst.dilation_y; auto z_wt = wt + (int)oz * cst.kernel_size; auto z_in = in + (int)gid.z * cst.input_size; auto z_out = out + (int)gid.z * cst.output_size + (int)gid.y * cst.output_width + (int)gid.x; - float4 result = float4(biasTerms[(short)oz]); + float4 result = float4(biasTerms[oz]); for (auto ky = sy, y = offset_y; ky < ey; ky++, y += cst.dilation_y) { for (auto kx = sx, x = offset_x; kx < ex; kx++, x += cst.dilation_x) { auto wt4 = z_wt[ky * cst.kernel_x + kx]; @@ -63,5 +63,6 @@ kernel void conv_depthwise(const device ftype4 *in [[buffer(0)]], result += float4(in4 * wt4); } } + *z_out = activate((ftype4)result, cst.activation); } diff --git a/source/backend/metal/MetalDeconvolution.metal b/source/backend/metal/MetalDeconvolution.metal index dfa7aff5..1cd21d02 100644 --- a/source/backend/metal/MetalDeconvolution.metal +++ b/source/backend/metal/MetalDeconvolution.metal @@ -47,24 +47,24 @@ kernel void deconv(const device ftype4 *in [[buffer(0)]], uint3 gid [[thread_position_in_grid]]) { if ((int)gid.x >= cst.output_width || (int)gid.y >= cst.output_height || (int)gid.z >= cst.batch * cst.output_slice) return; - short b = gid.z / cst.output_slice; - short o = gid.z % cst.output_slice; + int b = gid.z / cst.output_slice; + int o = gid.z % cst.output_slice; float4 result = cst.has_bias ? 
float4(biasTerms[o]) : 0; - short oy = (short)gid.y + cst.pad_y; - short ox = (short)gid.x + cst.pad_x; - short max_sy = min((cst.input_height - 1) * cst.stride_y, oy / cst.stride_y * cst.stride_y); - short max_sx = min((cst.input_width - 1) * cst.stride_x, ox / cst.stride_x * cst.stride_x); - short min_ky = UP_DIV(oy - max_sy, cst.dilation_y); - short min_kx = UP_DIV(ox - max_sx, cst.dilation_x); + int oy = (int)gid.y + cst.pad_y; + int ox = (int)gid.x + cst.pad_x; + int max_sy = min((cst.input_height - 1) * cst.stride_y, oy / cst.stride_y * cst.stride_y); + int max_sx = min((cst.input_width - 1) * cst.stride_x, ox / cst.stride_x * cst.stride_x); + int min_ky = UP_DIV(oy - max_sy, cst.dilation_y); + int min_kx = UP_DIV(ox - max_sx, cst.dilation_x); if ((oy - min_ky * cst.dilation_y) % cst.stride_y == 0 && (ox - min_kx * cst.dilation_x) % cst.stride_x == 0) { - short min_sy = max(0, ROUND_UP(oy + cst.dilation_y - cst.kernel_y * cst.dilation_y, cst.stride_y)); - short min_sx = max(0, ROUND_UP(ox + cst.dilation_x - cst.kernel_x * cst.dilation_x, cst.stride_x)); - short max_ky = (oy - min_sy) / cst.dilation_y; - short max_kx = (ox - min_sx) / cst.dilation_x; - short min_iy = (oy - max_ky * cst.dilation_y) / cst.stride_y; - short min_ix = (ox - max_kx * cst.dilation_x) / cst.stride_x; + int min_sy = max(0, ROUND_UP(oy + cst.dilation_y - cst.kernel_y * cst.dilation_y, cst.stride_y)); + int min_sx = max(0, ROUND_UP(ox + cst.dilation_x - cst.kernel_x * cst.dilation_x, cst.stride_x)); + int max_ky = (oy - min_sy) / cst.dilation_y; + int max_kx = (ox - min_sx) / cst.dilation_x; + int min_iy = (oy - max_ky * cst.dilation_y) / cst.stride_y; + int min_ix = (ox - max_kx * cst.dilation_x) / cst.stride_x; auto o_wt = wt + o * cst.input_slice * cst.kernel_size; auto b_in = in + b * cst.input_slice * cst.input_size; @@ -86,25 +86,25 @@ kernel void deconv_depthwise(const device ftype4 *in [[buffer(0)]], constant deconv_constants& cst [[buffer(2)]], const device ftype4 *wt [[buffer(3)]], const device ftype4 *biasTerms [[buffer(4)]], - ushort3 gid [[thread_position_in_grid]]) { + uint3 gid [[thread_position_in_grid]]) { if ((int)gid.x >= cst.output_width || (int)gid.y >= cst.output_height || (int)gid.z >= cst.batch * cst.output_slice) return; - float4 result = float4(biasTerms[(short)(gid.z % cst.input_slice)]); + float4 result = float4(biasTerms[(int)(gid.z % cst.input_slice)]); - short oy = (short)gid.y + cst.pad_y; - short ox = (short)gid.x + cst.pad_x; - short max_sy = min((cst.input_height - 1) * cst.stride_y, oy / cst.stride_y * cst.stride_y); - short max_sx = min((cst.input_width - 1) * cst.stride_x, ox / cst.stride_x * cst.stride_x); - short min_ky = UP_DIV(oy - max_sy, cst.dilation_y); - short min_kx = UP_DIV(ox - max_sx, cst.dilation_x); + int oy = (int)gid.y + cst.pad_y; + int ox = (int)gid.x + cst.pad_x; + int max_sy = min((cst.input_height - 1) * cst.stride_y, oy / cst.stride_y * cst.stride_y); + int max_sx = min((cst.input_width - 1) * cst.stride_x, ox / cst.stride_x * cst.stride_x); + int min_ky = UP_DIV(oy - max_sy, cst.dilation_y); + int min_kx = UP_DIV(ox - max_sx, cst.dilation_x); if ((oy - min_ky * cst.dilation_y) % cst.stride_y == 0 && (ox - min_kx * cst.dilation_x) % cst.stride_x == 0) { - short min_sy = max(0, ROUND_UP(oy + cst.dilation_y - cst.kernel_y * cst.dilation_y, cst.stride_y)); - short min_sx = max(0, ROUND_UP(ox + cst.dilation_x - cst.kernel_x * cst.dilation_x, cst.stride_x)); - short max_ky = (oy - min_sy) / cst.dilation_y; - short max_kx = (ox - min_sx) / cst.dilation_x; - 
short min_iy = (oy - max_ky * cst.dilation_y) / cst.stride_y; - short min_ix = (ox - max_kx * cst.dilation_x) / cst.stride_x; + int min_sy = max(0, ROUND_UP(oy + cst.dilation_y - cst.kernel_y * cst.dilation_y, cst.stride_y)); + int min_sx = max(0, ROUND_UP(ox + cst.dilation_x - cst.kernel_x * cst.dilation_x, cst.stride_x)); + int max_ky = (oy - min_sy) / cst.dilation_y; + int max_kx = (ox - min_sx) / cst.dilation_x; + int min_iy = (oy - max_ky * cst.dilation_y) / cst.stride_y; + int min_ix = (ox - max_kx * cst.dilation_x) / cst.stride_x; auto z_wt = wt + (int)gid.z * cst.kernel_size; auto z_in = in + (int)gid.z * cst.input_size; diff --git a/source/backend/metal/MetalGridSample.hpp b/source/backend/metal/MetalGridSample.hpp new file mode 100644 index 00000000..f8b5ccc0 --- /dev/null +++ b/source/backend/metal/MetalGridSample.hpp @@ -0,0 +1,38 @@ +// +// MetalGridSample.hpp +// MNN +// +// Created by MNN on 2021/03/24. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef MetalGridSample_hpp +#define MetalGridSample_hpp + +#import "core/Execution.hpp" +#import "MNN_generated.h" +#import "MetalBackend.hpp" + +#if MNN_METAL_ENABLED +namespace MNN { + +class MetalGridSample : public Execution { +public: + MetalGridSample(Backend *backend, const GridSample* gridSample); + virtual ~MetalGridSample() = default; + virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; + virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; + +private: + id mParams; + id mPipeline; + std::pair mThreads; + + SampleMode mMode; + BorderMode mPaddingMode; + bool mAlignCorners; +}; + +} // namespace MNN +#endif /* MNN_METAL_ENABLED */ +#endif /* MetalGridSample_hpp */ diff --git a/source/backend/metal/MetalGridSample.metal b/source/backend/metal/MetalGridSample.metal new file mode 100644 index 00000000..cdefc288 --- /dev/null +++ b/source/backend/metal/MetalGridSample.metal @@ -0,0 +1,120 @@ +// +// MetalGridSample.metal +// MNN +// +// Created by MNN on 2021/03/24. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include +#include "MetalDefine.metal" + +using namespace metal; + +struct grid_sample_params { + int batches; + int channels; + int inH; + int inW; + int outH; + int outW; + int mode; // 0-Bilinear, 1-Nearest + int paddingMode; // 0-Zeros, 1-Border, 2-Reflection + int alignCorners; +}; + +static float getPosition(float x, int range, int alignCorners, int paddingMode) { + if (paddingMode == 2/*GridSamplePaddingMode_REFLECTION*/) { + // if x is on the left side of -1.0, move it to the right side of 1.0 + if (x < -1.0f) { + x = x + ::ceil(1 - x) * 4; + } + // reflect + if (x > 1.0f) { + float l = x - 1.0f; + int reflectionNum = ::floor(l / 2.0); + float offset = l - reflectionNum * 2.0f; + x = (reflectionNum % 2 == 0) ? (1 - offset) : (-1.0f + offset); + } + } + + float a = alignCorners ? 1.0f : 0.0f; + float b = alignCorners ? 
0.0f : 1.0f; + return ((1 + x) * (range - a) - b) / 2.0f; +} + +static int CLAMP(int v, int min, int max) { + if ((v) < min) { + (v) = min; + } else if ((v) > max) { + (v) = max; + } + return v; +} + +static ftype4 sample(int h, int w, const device ftype4 *buffer, int height, int width, int paddingMode) { + if (h < 0 || h >= height || w < 0 || w >= width) { + if (paddingMode == 0/*GridSamplePaddingMode_ZEROS*/) { + return 0.0f; + } + // Clearly, CLAMP is the right way to go for GridSamplePaddingMode_BORDER + // For GridSamplePaddingMode_REFLECTION, since we have reflected the values into (-1, 1), + // the leftover reflections degrade to GridSamplePaddingMode_BORDER + h = CLAMP(h, 0, height - 1); + w = CLAMP(w, 0, width - 1); + } + + return buffer[h * width + w]; +} + +static ftype4 interpolate(float h, float w, const device ftype4 *buffer, int height, int width, int mode, + int paddingMode) { + if (mode == 1/*GridSampleMode_NEAREST*/) { + int nh = ::floor(h+0.5f); + int nw = ::floor(w+0.5f); + return sample(nh, nw, buffer, height, width, paddingMode); + } + + // mode == GridSampleMode_BILINEAR + int w0_h = ::floor(h); + int w0_w = ::floor(w); + int w1_h = w0_h + 1; + int w1_w = w0_w + 1; + ftype4 oneV = (ftype4)((ftype)1.0f); + + ftype4 i00 = sample(w0_h, w0_w, buffer, height, width, paddingMode); + ftype4 i01 = sample(w0_h, w1_w, buffer, height, width, paddingMode); + ftype4 i10 = sample(w1_h, w0_w, buffer, height, width, paddingMode); + ftype4 i11 = sample(w1_h, w1_w, buffer, height, width, paddingMode); + + + ftype4 f0 = (ftype4)((ftype)(w1_w - w)); + ftype4 f1 = oneV - f0; + ftype4 h0 = (ftype4)((ftype)(w1_h - h)); + ftype4 h1 = oneV - h0; + + ftype4 i0 = i00 * f0 + i01 * f1; + ftype4 i1 = i10 * f0 + i11 * f1; + + return i0 * h0 + i1 * h1; +} + +kernel void grid_sample(const device ftype4 *input [[buffer(0)]], + const device ftype *grid [[buffer(1)]], + device ftype4 *output [[buffer(2)]], + constant grid_sample_params &p [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { + if ((int)gid.x >= p.outW || (int)gid.y >= p.outH || (int)gid.z >= p.batches) + return; + + int gridPos = gid.z*p.outH*p.outW*2 + gid.y*p.outW*2 + gid.x*2; + auto x = getPosition(grid[gridPos+0], p.inW, p.alignCorners, p.paddingMode); + auto y = getPosition(grid[gridPos+1], p.inH, p.alignCorners, p.paddingMode); + + const int channelC4 = (p.channels + 3) / 4; + for (int c = 0; c < channelC4; ++ c) { + auto outputPos = gid.z*channelC4*p.outH*p.outW + c*p.outH*p.outW + gid.y*p.outW + gid.x; + auto inputPtr = input + gid.z*channelC4*p.inH*p.inW + c*p.inH*p.inW; + output[outputPos] = interpolate(y, x, inputPtr, p.inH, p.inW, p.mode, p.paddingMode); + } +} diff --git a/source/backend/metal/MetalGridSample.mm b/source/backend/metal/MetalGridSample.mm new file mode 100644 index 00000000..73f15876 --- /dev/null +++ b/source/backend/metal/MetalGridSample.mm @@ -0,0 +1,79 @@ +// +// MetalGridSample.mm +// MNN +// +// Created by MNN on 2021/03/24. 
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#import "backend/metal/MetalGridSample.hpp" +#import "backend/metal/MNNMetalContext.h" + +#if MNN_METAL_ENABLED +namespace MNN { + +MetalGridSample::MetalGridSample(Backend *backend, const GridSample *gridSample) + : Execution(backend) { + mMode = gridSample->mode(); + mPaddingMode = gridSample->paddingMode(); + mAlignCorners = gridSample->alignCorners(); + + auto metal_backend = static_cast(this->backend()); + auto context = (__bridge MNNMetalContext *)metal_backend->context(); + mParams = [context newDeviceBuffer:9*sizeof(int) access:CPUWriteOnly]; +} + +ErrorCode MetalGridSample::onResize(const std::vector &inputs, + const std::vector &outputs) { + auto inputTensor = inputs[0]; + auto outputTensor = outputs[0]; + + ((int *)mParams.contents)[0] = inputTensor->batch();//inputTensor->buffer().dim[0].extent; // batches + ((int *)mParams.contents)[1] = inputTensor->channel();//->buffer().dim[1].extent; // channels + ((int *)mParams.contents)[2] = inputTensor->height();//buffer().dim[2].extent; // inH + ((int *)mParams.contents)[3] = inputTensor->width();//buffer().dim[3].extent; // inW + ((int *)mParams.contents)[4] = outputTensor->height();//->buffer().dim[2].extent; // outH + ((int *)mParams.contents)[5] = outputTensor->width();//->buffer().dim[3].extent; // outW + ((int *)mParams.contents)[6] = mMode; + ((int *)mParams.contents)[7] = mPaddingMode; + ((int *)mParams.contents)[8] = mAlignCorners; + + auto backend = static_cast(this->backend()); + auto context = (__bridge MNNMetalContext *)backend->context(); + mPipeline = [context pipelineWithName:@"grid_sample"]; + + int batches = ((int *)mParams.contents)[0]; + int channels = ((int *)mParams.contents)[1]; + int outH = ((int *)mParams.contents)[4]; + int outW = ((int *)mParams.contents)[5]; + mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(outW, outH, batches)]; + + //printf("re:%d %d %d, %d %d %d, %d %d\n", mThreads.first.width, mThreads.first.height, mThreads.first.depth, mThreads.second.width, mThreads.second.height, mThreads.second.depth, ((int *)mParams.contents)[3], ((int *)mParams.contents)[2]); + return NO_ERROR; +} + +ErrorCode MetalGridSample::onExecute(const std::vector &inputs, const std::vector &outputs) { + auto backend = static_cast(this->backend()); + + auto encoder = backend->encoder(); + [encoder setComputePipelineState:mPipeline]; + [encoder setBuffer:(__bridge id ) (void *) inputs[0]->deviceId() offset:0 atIndex:0]; + [encoder setBuffer:(__bridge id ) (void *) inputs[1]->deviceId() offset:0 atIndex:1]; + [encoder setBuffer:(__bridge id ) (void *) outputs[0]->deviceId() offset:0 atIndex:2]; + [encoder setBuffer:mParams offset:0 atIndex:3]; + [encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second]; + + return NO_ERROR; +} + +class MetalGridSampleCreator : public MetalBackend::Creator { +public: + virtual Execution *onCreate(const std::vector &inputs, const MNN::Op *op, + Backend *backend) const override { + return new MetalGridSample(backend, op->main_as_GridSample()); + } +}; + +REGISTER_METAL_OP_CREATOR(MetalGridSampleCreator, OpType_GridSample); +} // namespace MNN +#endif /* MNN_METAL_ENABLED */ diff --git a/source/backend/metal/MetalMatMul.metal b/source/backend/metal/MetalMatMul.metal index cb992af2..feb73acd 100644 --- a/source/backend/metal/MetalMatMul.metal +++ b/source/backend/metal/MetalMatMul.metal @@ -21,7 +21,7 @@ kernel void matmul(const device ftype *in0 [[buffer(0)]], device ftype *out 
[[buffer(2)]], constant matmul_shape &s [[buffer(3)]], uint2 gid[[thread_position_in_grid]]) { - if ((int)gid.x < s.mat_size.x || (int)gid.y < s.mat_size.y) { + if ((int)gid.x < s.mat_size.x && (int)gid.y < s.mat_size.y) { auto off_in0 = in0 + int(gid.y) * s.in_stride.x; auto off_in1 = in1 + int(gid.x) * s.in_stride.z; float value = 0.f; @@ -38,7 +38,7 @@ kernel void matmul_bias(const device ftype *in0 [[buffer(0)]], device ftype *out [[buffer(3)]], constant matmul_shape &s [[buffer(4)]], uint2 gid[[thread_position_in_grid]]) { - if ((int)gid.x < s.mat_size.x || (int)gid.y < s.mat_size.y) { + if ((int)gid.x < s.mat_size.x && (int)gid.y < s.mat_size.y) { auto off_in0 = in0 + int(gid.y) * s.in_stride.x; auto off_in1 = in1 + int(gid.x) * s.in_stride.z; float value = 0.f; diff --git a/source/backend/metal/MetalOPRegister.mm b/source/backend/metal/MetalOPRegister.mm index 33929457..7b1c6ba2 100644 --- a/source/backend/metal/MetalOPRegister.mm +++ b/source/backend/metal/MetalOPRegister.mm @@ -15,6 +15,7 @@ extern void ___MetalPoolingCreator__OpType_Pooling__(); extern void ___MetalScaleCreator__OpType_Scale__(); extern void ___MetalInterpCreator__OpType_Interp__(); + extern void ___MetalGridSampleCreator__OpType_GridSample__(); extern void ___MetalUnaryCreator__OpType_UnaryOp__(); extern void ___MetalUnaryCreator__OpType_TanH__(); extern void ___MetalUnaryCreator__OpType_Sigmoid__(); @@ -37,6 +38,7 @@ void registerMetalOps() { ___MetalPoolingCreator__OpType_Pooling__(); ___MetalScaleCreator__OpType_Scale__(); ___MetalInterpCreator__OpType_Interp__(); + ___MetalGridSampleCreator__OpType_GridSample__(); ___MetalUnaryCreator__OpType_UnaryOp__(); ___MetalUnaryCreator__OpType_TanH__(); ___MetalUnaryCreator__OpType_Sigmoid__(); diff --git a/source/backend/metal/MetalRaster.mm b/source/backend/metal/MetalRaster.mm index 00407949..36cc8670 100644 --- a/source/backend/metal/MetalRaster.mm +++ b/source/backend/metal/MetalRaster.mm @@ -71,20 +71,20 @@ ErrorCode MetalRaster::onResize(const std::vector &inputs, const std:: NSString* kernelName = nil; switch (bytes) { case 4: - kernelName = @"blit_int32x4"; + kernelName = @"blit_intx4"; break; case 2: kernelName = @"blit_int64"; break; case 1: - kernelName = @"blit_int32"; + kernelName = @"blit_int"; break; default: break; } if (outputs[0]->getType().code == halide_type_float) { #if MNN_METAL_FULL_PRECISION - kernelName = @"blit_int32x4"; + kernelName = @"blit_intx4"; #else kernelName = @"blit_int64"; #endif @@ -162,7 +162,7 @@ ErrorCode MetalRaster::onResize(const std::vector &inputs, const std:: } if (outputs[0]->getType().code == halide_type_float) { #if MNN_METAL_FULL_PRECISION - kernelName = @"blit_int32"; + kernelName = @"blit_int"; #else kernelName = @"blit_int16"; #endif diff --git a/source/backend/metal/MetalUnary.metal b/source/backend/metal/MetalUnary.metal index 5d50054d..83977a8a 100644 --- a/source/backend/metal/MetalUnary.metal +++ b/source/backend/metal/MetalUnary.metal @@ -23,6 +23,9 @@ static inline float4 expm1(float4 value) {return exp(value) - 1;} static inline float4 reciprocal(float4 value) {return 1.0/(value);} static inline float4 sigmoid(float4 value) {return 1.f / (1.f + exp(-value));} static inline float4 log1p(float4 value) {return log(1.f + value);} +static inline float4 hardswish(float4 value) { + return (float4)(1.0/6.0) * (value * min(max(value+(float4)3, 0), (float4)6)); +} #define define_op(op) \ kernel void unary_##op##_x4(const device ftype4 *in [[buffer(0)]], \ @@ -62,4 +65,5 @@ define_op(acosh); define_op(asinh); 
define_op(atanh); define_op(round); +define_op(hardswish); diff --git a/source/backend/metal/MetalUnary.mm b/source/backend/metal/MetalUnary.mm index 299b46fd..b76f371e 100755 --- a/source/backend/metal/MetalUnary.mm +++ b/source/backend/metal/MetalUnary.mm @@ -46,6 +46,7 @@ static NSString *kernelForType(UnaryOpOperation type) { op_case(SINH, sinh); op_case(ASINH, asinh); op_case(ATANH, atanh); + op_case(HARDSWISH, hardswish); default: FUNC_PRINT_ALL(EnumNameUnaryOpOperation(type), s); return nil; diff --git a/source/backend/opencl/core/OpenCLBackend.cpp b/source/backend/opencl/core/OpenCLBackend.cpp index 93d21c60..c45102c7 100644 --- a/source/backend/opencl/core/OpenCLBackend.cpp +++ b/source/backend/opencl/core/OpenCLBackend.cpp @@ -47,7 +47,8 @@ std::pair CLRuntime::onGetCache() { return mOpenCLRuntime->makeCache(); } -Backend* CLRuntime::onCreate() const { +Backend* CLRuntime::onCreate(const BackendConfig* config) const { + // FIXME: Use config info return new OpenCLBackend(this); } @@ -82,39 +83,6 @@ OpenCLBackend::OpenCLBackend(const CLRuntime *runtime) mStaticBufferPool.reset(new BufferPool(mOpenCLRuntime->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR)); mImagePool.reset(new ImagePool(mOpenCLRuntime->context())); mBufferPool.reset(new BufferPool(mOpenCLRuntime->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR)); - - #ifndef MNN_OPENCL_BUFFER_CLOSED - if(mOpenCLRuntime->getGpuMemType() == BUFFER) - { - std::set buildOptions; - //when input or output need buffer2image transformation, open macro BUFFER_IMAGE_IO_TRANS - //because cpu input and output are fp32 - buildOptions.emplace("-DBUFFER_FORMAT_INP_TRANS"); - mNCHWBufferToNC4HW4BufferInp = mOpenCLRuntime->buildKernel("buffer_convert_buf", "nchw_buffer_to_nc4hw4_buffer", buildOptions); - mNHWCBufferToNC4HW4BufferInp = mOpenCLRuntime->buildKernel("buffer_convert_buf", "nhwc_buffer_to_nc4hw4_buffer", buildOptions); - mNC4HW4BufferToNC4HW4BufferInp = mOpenCLRuntime->buildKernel("buffer_convert_buf", "nc4hw4_buffer_to_nc4hw4_buffer", buildOptions); - - buildOptions.clear(); - buildOptions.emplace("-DBUFFER_FORMAT_OUT_TRANS"); - - mNC4HW4BufferToNHWCBufferOut = mOpenCLRuntime->buildKernel("buffer_convert_buf", "nc4hw4_buffer_to_nhwc_buffer", buildOptions); - mNC4HW4BufferToNCHWBufferOut = mOpenCLRuntime->buildKernel("buffer_convert_buf", "nc4hw4_buffer_to_nchw_buffer", buildOptions); - mNC4HW4BufferToNC4HW4BufferOut = mOpenCLRuntime->buildKernel("buffer_convert_buf", "nc4hw4_buffer_to_nc4hw4_buffer", buildOptions); - } - else - #endif /* MNN_OPENCL_BUFFER_CLOSED */ - { - std::set buildOptions; - //when input or output need buffer2image transformation, open macro BUFFER_IMAGE_IO_TRANS - //because cpu input and output are fp32 - buildOptions.emplace("-DBUFFER_IMAGE_IO_TRANS"); - mNC4HW4BufferToImageFloat = mOpenCLRuntime->buildKernel("buffer_to_image", "nc4hw4_buffer_to_image", buildOptions); - mNCHWBufferToImageFloat = mOpenCLRuntime->buildKernel("buffer_to_image", "nchw_buffer_to_image", buildOptions); - mNHWCBufferToImageFloat = mOpenCLRuntime->buildKernel("buffer_to_image", "nhwc_buffer_to_image", buildOptions); - mImageToNC4HW4BufferFloat = mOpenCLRuntime->buildKernel("buffer_to_image", "image_to_nc4hw4_buffer", buildOptions); - mImageToNHWCBufferFloat = mOpenCLRuntime->buildKernel("buffer_to_image", "image_to_nhwc_buffer", buildOptions); - mImageToNCHWBufferFloat = mOpenCLRuntime->buildKernel("buffer_to_image", "image_to_nchw_buffer", buildOptions); - } } } @@ -266,11 +234,13 @@ Execution* OpenCLBackend::onCreate(const 
std::vector& inputs, const std auto iter = creators->find(std::make_pair(op->type(), mOpenCLRuntime->getGpuMemType())); if (iter == creators->end()) { + #if 0//close log if (nullptr != op->name()) { MNN_PRINT("Don't support type %s memObject:%d, %s\n", EnumNameOpType(op->type()), mOpenCLRuntime->getGpuMemType(), op->name()->c_str()); } else { MNN_PRINT("Don't support type %s memObject:%d\n", EnumNameOpType(op->type()), mOpenCLRuntime->getGpuMemType()); } + #endif return NULL; } @@ -311,6 +281,7 @@ Execution* OpenCLBackend::onCreate(const std::vector& inputs, const std } if (!valid) { + #if 0//close log for (auto t : inputs) { auto tensorShape = OpenCL::tensorShapeFormat(t); MNN_PRINT("input n:%d, h:%d, w:%d, c:%d\n", tensorShape[0], tensorShape[1], tensorShape[2], tensorShape[3]); @@ -320,17 +291,20 @@ Execution* OpenCLBackend::onCreate(const std::vector& inputs, const std MNN_PRINT("output n:%d, h:%d, w:%d, c:%d\n", tensorShape[0], tensorShape[1], tensorShape[2], tensorShape[3]); } MNN_PRINT("beyond cl_image creat size! fallback to cpu backend\n"); + #endif return NULL; } } auto exe = iter->second->onCreate(inputs, outputs, op, this); if (NULL == exe) { + #if 0//close log if (nullptr != op->name()) { MNN_PRINT("The Creator Don't support type %s, memObject:%d, %s\n", MNN::EnumNameOpType(op->type()), mOpenCLRuntime->getGpuMemType(), op->name()->c_str()); } else { MNN_PRINT("The Creator Don't support type %s, memObject:%d,\n", EnumNameOpType(op->type()), mOpenCLRuntime->getGpuMemType()); } + #endif return NULL; } #ifdef LOG_VERBOSE @@ -340,6 +314,39 @@ Execution* OpenCLBackend::onCreate(const std::vector& inputs, const std } void OpenCLBackend::onResizeBegin() { + #ifndef MNN_OPENCL_BUFFER_CLOSED + if(mOpenCLRuntime->getGpuMemType() == BUFFER) + { + std::set buildOptions; + //when input or output need buffer2image transformation, open macro BUFFER_IMAGE_IO_TRANS + //because cpu input and output are fp32 + buildOptions.emplace("-DBUFFER_FORMAT_INP_TRANS"); + mNCHWBufferToNC4HW4BufferInp = mOpenCLRuntime->buildKernel("buffer_convert_buf", "nchw_buffer_to_nc4hw4_buffer", buildOptions); + mNHWCBufferToNC4HW4BufferInp = mOpenCLRuntime->buildKernel("buffer_convert_buf", "nhwc_buffer_to_nc4hw4_buffer", buildOptions); + mNC4HW4BufferToNC4HW4BufferInp = mOpenCLRuntime->buildKernel("buffer_convert_buf", "nc4hw4_buffer_to_nc4hw4_buffer", buildOptions); + + buildOptions.clear(); + buildOptions.emplace("-DBUFFER_FORMAT_OUT_TRANS"); + + mNC4HW4BufferToNHWCBufferOut = mOpenCLRuntime->buildKernel("buffer_convert_buf", "nc4hw4_buffer_to_nhwc_buffer", buildOptions); + mNC4HW4BufferToNCHWBufferOut = mOpenCLRuntime->buildKernel("buffer_convert_buf", "nc4hw4_buffer_to_nchw_buffer", buildOptions); + mNC4HW4BufferToNC4HW4BufferOut = mOpenCLRuntime->buildKernel("buffer_convert_buf", "nc4hw4_buffer_to_nc4hw4_buffer", buildOptions); + } + else + #endif /* MNN_OPENCL_BUFFER_CLOSED */ + { + std::set buildOptions; + //when input or output need buffer2image transformation, open macro BUFFER_IMAGE_IO_TRANS + //because cpu input and output are fp32 + buildOptions.emplace("-DBUFFER_IMAGE_IO_TRANS"); + mNC4HW4BufferToImageFloat = mOpenCLRuntime->buildKernel("buffer_to_image", "nc4hw4_buffer_to_image", buildOptions); + mNCHWBufferToImageFloat = mOpenCLRuntime->buildKernel("buffer_to_image", "nchw_buffer_to_image", buildOptions); + mNHWCBufferToImageFloat = mOpenCLRuntime->buildKernel("buffer_to_image", "nhwc_buffer_to_image", buildOptions); + mImageToNC4HW4BufferFloat = mOpenCLRuntime->buildKernel("buffer_to_image", 
"image_to_nc4hw4_buffer", buildOptions); + mImageToNHWCBufferFloat = mOpenCLRuntime->buildKernel("buffer_to_image", "image_to_nhwc_buffer", buildOptions); + mImageToNCHWBufferFloat = mOpenCLRuntime->buildKernel("buffer_to_image", "image_to_nchw_buffer", buildOptions); + } + mOpenCLRuntime->setCommandQueueProfileEnable(); } diff --git a/source/backend/opencl/core/OpenCLBackend.hpp b/source/backend/opencl/core/OpenCLBackend.hpp index 9ba576d9..0274ef69 100644 --- a/source/backend/opencl/core/OpenCLBackend.hpp +++ b/source/backend/opencl/core/OpenCLBackend.hpp @@ -74,7 +74,7 @@ public: CLRuntime(const Backend::Info& info); virtual ~CLRuntime(); - virtual Backend* onCreate() const override; + virtual Backend* onCreate(const BackendConfig* config) const override; virtual void onGabageCollect(int level) override; virtual std::pair onGetCache() override; virtual bool onSetCache(const void* buffer, size_t size) override; diff --git a/source/backend/opencl/core/OpenCLRunningUtils.cpp b/source/backend/opencl/core/OpenCLRunningUtils.cpp index c7130549..c585239a 100644 --- a/source/backend/opencl/core/OpenCLRunningUtils.cpp +++ b/source/backend/opencl/core/OpenCLRunningUtils.cpp @@ -232,16 +232,31 @@ std::pair, uint32_t> localWS3DDefault(const std::vectorgetCLTuneLevel() == None) { // define not tune method to choose lws - if(runtime->getGpuMemType() == GpuMemObject::IMAGE) { - lws_prefer[0] = 4; - lws_prefer[1] = 4; - lws_prefer[2] = 2; - } else { + lws_prefer[0] = 0; + lws_prefer[1] = 0; + lws_prefer[2] = 0; + min_cost = 0; + } + + if(runtime->getCLTuneLevel() != None) { + cl::Event event; + cl_int res = runtime->commandQueue().enqueueNDRangeKernel( + mKernel, cl::NullRange, + cl::NDRange(gws[0], gws[1], gws[2]), + cl::NullRange, + nullptr, &event); + MNN_CHECK_CL_SUCCESS(res, kernelName.c_str()); + if (res != CL_SUCCESS) { + MNN_PRINT("3D lws null res %s\n", kernelName.c_str()); + } + + int cost_time = (int)runtime->getCostTime(&event); + if(cost_time < min_cost) { lws_prefer[0] = 0; lws_prefer[1] = 0; lws_prefer[2] = 0; + min_cost = cost_time; } - min_cost = 0; } if (tunedLws.find(info) == tunedLws.end()) { @@ -413,16 +428,31 @@ std::pair, uint32_t> localWS2DDefault(const std::vectorgetCLTuneLevel() == None) { // define not tune method to choose lws - if(runtime->getGpuMemType() == GpuMemObject::IMAGE) { - lws_prefer[0] = 4; - lws_prefer[1] = 4; - } else { - lws_prefer[0] = 0; - lws_prefer[1] = 0; - } + lws_prefer[0] = 0; + lws_prefer[1] = 0; min_cost = 0; } + if(runtime->getCLTuneLevel() != None) { + cl::Event event; + cl_int res = runtime->commandQueue().enqueueNDRangeKernel( + mKernel, cl::NullRange, + cl::NDRange(gws[0], gws[1]), + cl::NullRange, + nullptr, &event); + MNN_CHECK_CL_SUCCESS(res, kernelName.c_str()); + if (res != CL_SUCCESS) { + MNN_PRINT("2D lws null res %s\n", kernelName.c_str()); + } + + int cost_time = (int)runtime->getCostTime(&event); + if(cost_time < min_cost) { + lws_prefer[0] = 0; + lws_prefer[1] = 0; + min_cost = cost_time; + } + } + if (tunedLws.find(info) == tunedLws.end()) { //printf("2dLocalWS %d Insert! 
gws:%d %d, lws:%d %d\n", (int)tunedLws.size(), gws[0], gws[1], lws_prefer[0], lws_prefer[1]); tunedLws.insert(std::make_pair(info, std::make_pair(lws_prefer, min_cost))); @@ -447,11 +477,11 @@ void run3DKernelDefault(const ::cl::Kernel &kernel, const std::vector if(lws[0]==0 || lws[1]==0 || lws[2]==0){ res = runtime->commandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1], internalGlobalWS[2]), - cl::NullRange); + cl::NullRange, nullptr, eventPtr); }else{ res = runtime->commandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1], internalGlobalWS[2]), - cl::NDRange(lws[0], lws[1], lws[2])); + cl::NDRange(lws[0], lws[1], lws[2]), nullptr, eventPtr); } MNN_CHECK_CL_SUCCESS(res, "run3d"); @@ -486,7 +516,7 @@ void runKernel2D(const ::cl::Kernel &kernel, const std::vector &gws, c cl_int res = CL_SUCCESS; if(lws[0]==0 || lws[1]==0){ res = runtime->commandQueue().enqueueNDRangeKernel( - kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1]), cl::NullRange); + kernel, cl::NullRange, cl::NDRange(internalGlobalWS[0], internalGlobalWS[1]), cl::NullRange, nullptr, eventPtr); }else{ res = runtime->commandQueue().enqueueNDRangeKernel( diff --git a/source/backend/opencl/execution/buffer/BinaryBufExecution.cpp b/source/backend/opencl/execution/buffer/BinaryBufExecution.cpp index f7a01d0c..8c2a0764 100644 --- a/source/backend/opencl/execution/buffer/BinaryBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/BinaryBufExecution.cpp @@ -111,22 +111,42 @@ public: MNN_ASSERT(inputs.size() > 1); switch (op->main_as_BinaryOp()->opType()) { + case BinaryOpOperation_MUL: + return new BinaryBufExecution(inputs, "in0*in1", op, backend); case BinaryOpOperation_ADD: return new BinaryBufExecution(inputs, "in0+in1", op, backend); case BinaryOpOperation_SUB: return new BinaryBufExecution(inputs, "in0-in1", op, backend); - case BinaryOpOperation_MUL: - return new BinaryBufExecution(inputs, "in0*in1", op, backend); - case BinaryOpOperation_POW: - return new BinaryBufExecution(inputs, "pow(in0,in1)", op, backend); - case BinaryOpOperation_DIV: - return new BinaryBufExecution(inputs, "sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001))", op, backend); - case BinaryOpOperation_MAXIMUM: - return new BinaryBufExecution(inputs, "in0>in1?in0:in1", op, backend); - case BinaryOpOperation_MINIMUM: - return new BinaryBufExecution(inputs, "in0>in1?in1:in0", op, backend); case BinaryOpOperation_REALDIV: return new BinaryBufExecution(inputs, "sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001))", op, backend); + case BinaryOpOperation_MINIMUM: + return new BinaryBufExecution(inputs, "in0>in1?in1:in0", op, backend); + case BinaryOpOperation_MAXIMUM: + return new BinaryBufExecution(inputs, "in0>in1?in0:in1", op, backend); + case BinaryOpOperation_GREATER: + return new BinaryBufExecution(inputs, "convert_float4(isgreater(in0,in1))", op, backend); + case BinaryOpOperation_LESS: + return new BinaryBufExecution(inputs, "convert_float4(isless(in0,in1))", op, backend); + case BinaryOpOperation_LESS_EQUAL: + return new BinaryBufExecution(inputs, "convert_float4(islessequal(in0,in1))", op, backend); + case BinaryOpOperation_GREATER_EQUAL: + return new BinaryBufExecution(inputs, "convert_float4(isgreaterequal(in0,in1))", op, backend); + case BinaryOpOperation_EQUAL: + return new BinaryBufExecution(inputs, 
"convert_float4(isequal(in0,in1))", op, backend); + case BinaryOpOperation_FLOORDIV: + return new BinaryBufExecution(inputs, "floor(sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001)))", op, backend); + case BinaryOpOperation_FLOORMOD: + return new BinaryBufExecution(inputs, "in0-floor(sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001)))*in1", op, backend); + case BinaryOpOperation_POW: + return new BinaryBufExecution(inputs, "pow(in0,in1)", op, backend); + case BinaryOpOperation_SquaredDifference: + return new BinaryBufExecution(inputs, "(in0-in1)*(in0-in1)", op, backend); + case BinaryOpOperation_ATAN2: + return new BinaryBufExecution(inputs, "atan(sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001)))", op, backend); + case BinaryOpOperation_NOTEQUAL: + return new BinaryBufExecution(inputs, "convert_float4(isnotequal(in0,in1))", op, backend); + case BinaryOpOperation_MOD: + return new BinaryBufExecution(inputs, "in0-sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001))", op, backend); default: break; } diff --git a/source/backend/opencl/execution/buffer/UnaryBufExecution.cpp b/source/backend/opencl/execution/buffer/UnaryBufExecution.cpp index a55baad0..b8e56438 100644 --- a/source/backend/opencl/execution/buffer/UnaryBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/UnaryBufExecution.cpp @@ -89,53 +89,71 @@ public: const MNN::Op* op, Backend* backend) const override { if (op->type() == OpType_UnaryOp) { switch (op->main_as_UnaryOp()->opType()) { + case UnaryOpOperation_ABS: + return new UnaryBufExecution("fabs(convert_float4(in))", backend); case UnaryOpOperation_SQUARE: return new UnaryBufExecution("in*in", backend); - case UnaryOpOperation_ERF: + case UnaryOpOperation_RSQRT: + return new UnaryBufExecution("rsqrt(convert_float4(in))", backend); + case UnaryOpOperation_NEG: + return new UnaryBufExecution("-(in)", backend); + case UnaryOpOperation_EXP: + return new UnaryBufExecution("exp(convert_float4(in))", backend); + case UnaryOpOperation_COS: + return new UnaryBufExecution("cos(convert_float4(in))", backend); + case UnaryOpOperation_SIN: + return new UnaryBufExecution("sin(convert_float4(in))", backend); + case UnaryOpOperation_TAN: + return new UnaryBufExecution("tan(convert_float4(in))", backend); + case UnaryOpOperation_ATAN: + return new UnaryBufExecution("atan(convert_float4(in))", backend); + case UnaryOpOperation_SQRT: + return new UnaryBufExecution("sqrt(convert_float4(in))", backend); + case UnaryOpOperation_CEIL: + return new UnaryBufExecution("ceil(convert_float4(in))", backend); + case UnaryOpOperation_RECIPROCAL: + return new UnaryBufExecution("native_recip(convert_float4(in))", backend); + case UnaryOpOperation_LOG1P: + return new UnaryBufExecution("log1p(convert_float4(in))", backend); + case UnaryOpOperation_LOG: + return new UnaryBufExecution("native_log(convert_float4(in)>(float4)(0.0000001)?convert_float4(in):(float4)(0.0000001))", backend); + case UnaryOpOperation_FLOOR: + return new UnaryBufExecution("floor(convert_float4(in))", backend); + case UnaryOpOperation_BNLL: + return new UnaryBufExecution("in>(FLOAT4)((FLOAT)0)?(in+native_log(exp(convert_float4(-(in)))+(float4)(1.0))):(native_log(exp(convert_float4(in))+(float4)(1.0)))", backend); + case UnaryOpOperation_ACOSH: + return new UnaryBufExecution("acosh(convert_float4(in))", backend); + case UnaryOpOperation_SINH: + return new 
UnaryBufExecution("sinh(convert_float4(in))", backend); + case UnaryOpOperation_ASINH: + return new UnaryBufExecution("asinh(convert_float4(in))", backend); + case UnaryOpOperation_ATANH: + return new UnaryBufExecution("atanh(convert_float4(in))", backend); + case UnaryOpOperation_SIGN: + return new UnaryBufExecution("sign(convert_float4(in))", backend); + case UnaryOpOperation_ROUND: + return new UnaryBufExecution("round(convert_float4(in))", backend); + case UnaryOpOperation_COSH: + return new UnaryBufExecution("cosh(convert_float4(in))", backend); + case UnaryOpOperation_ERF: return new UnaryBufExecution("erf(convert_float4(in))", backend); case UnaryOpOperation_ERFC: return new UnaryBufExecution("erfc(convert_float4(in))", backend); - case UnaryOpOperation_SQRT: - return new UnaryBufExecution("sqrt(convert_float4(in))", backend); - case UnaryOpOperation_RSQRT: - return new UnaryBufExecution("rsqrt(convert_float4(in))", backend); - case UnaryOpOperation_ABS: - return new UnaryBufExecution("fabs(convert_float4(in))", backend); - case UnaryOpOperation_SIN: - return new UnaryBufExecution("sin(convert_float4(in))", backend); - case UnaryOpOperation_COS: - return new UnaryBufExecution("cos(convert_float4(in))", backend); - case UnaryOpOperation_SIGN: - return new UnaryBufExecution("sign(convert_float4(in))", backend); - case UnaryOpOperation_EXP: - return new UnaryBufExecution("exp(convert_float4(in))", backend); - case UnaryOpOperation_NEG: - return new UnaryBufExecution("-(in)", backend); - case UnaryOpOperation_TAN: - return new UnaryBufExecution("tan(convert_float4(in))", backend); - case UnaryOpOperation_CEIL: - return new UnaryBufExecution("ceil(convert_float4(in))", backend); - case UnaryOpOperation_LOG1P: - return new UnaryBufExecution("log1p(convert_float4(in))", backend); - case UnaryOpOperation_FLOOR: - return new UnaryBufExecution("floor(convert_float4(in))", backend); - case UnaryOpOperation_ROUND: - return new UnaryBufExecution("round(convert_float4(in))", backend); + case UnaryOpOperation_EXPM1: + return new UnaryBufExecution("expm1(convert_float4(in))", backend); case UnaryOpOperation_SIGMOID: return new UnaryBufExecution("native_recip((float4)1+native_exp(convert_float4(-in)))", backend); case UnaryOpOperation_TANH: return new UnaryBufExecution("tanh(convert_float4(in))", backend); - case UnaryOpOperation_RECIPROCAL: - return new UnaryBufExecution("native_recip(convert_float4(in))", backend); - case UnaryOpOperation_LOG: - return new UnaryBufExecution("native_log(convert_float4(in+(FLOAT4)((FLOAT)0.0000001)))", backend); + case UnaryOpOperation_HARDSWISH: + return new UnaryBufExecution("in>(FLOAT4)((FLOAT)-3)?(in<(FLOAT4)((FLOAT)3)?((convert_float4(in)*(convert_float4(in)+(float4)3.0))/(float4)6.0):convert_float4(in)):(float4)(0.0)", backend); default: break; } return nullptr; } if (op->type() == OpType_Sigmoid) { - return new UnaryBufExecution("native_recip((float4)(1)+native_exp(convert_float4(-in)))", backend); + return new UnaryBufExecution("native_recip((float4)(1.0)+native_exp(convert_float4(-(in))))", backend); } if (op->type() == OpType_TanH) { return new UnaryBufExecution("tanh(convert_float4(in))", backend); diff --git a/source/backend/opencl/execution/cl/binary.cl b/source/backend/opencl/execution/cl/binary.cl index 07b5a370..cf45391d 100644 --- a/source/backend/opencl/execution/cl/binary.cl +++ b/source/backend/opencl/execution/cl/binary.cl @@ -30,7 +30,8 @@ __kernel void binary_same_channel_broadcast(__read_only image2d_t input0, __read (int2)(nhwc.w*whInput1.x, 
nhwc.x*whOutput.y+nhwc.y); } in1 = RI_F(input1, SAMPLER, pos1); - WI_F(output, pos, OPERATOR); + FLOAT4 out = CONVERT_FLOAT4(OPERATOR); + WI_F(output, pos, out); } __kernel void binary_1toM_channel_broadcast_on_awh(__read_only image2d_t input0, __read_only image2d_t input1, __write_only image2d_t output, @@ -57,7 +58,8 @@ __kernel void binary_1toM_channel_broadcast_on_awh(__read_only image2d_t input0, pos1 = (int2)(nhwc.w*whOutput.x+nhwc.z, nhwc.x*whOutput.y+nhwc.y); } in1 = RI_F(input1, SAMPLER, pos1); - WI_F(output, pos, OPERATOR); + FLOAT4 out = CONVERT_FLOAT4(OPERATOR); + WI_F(output, pos, out); } __kernel void binary_1toM_channel_broadcast_on_1wh(__read_only image2d_t input0, __read_only image2d_t input1, __write_only image2d_t output, @@ -92,7 +94,8 @@ __kernel void binary_1toM_channel_broadcast_on_1wh(__read_only image2d_t input0, (int2)(nhwc.w * whInput1.x, nhwc.x * whOutput.y + nhwc.y); } in1 = RI_F(input1, SAMPLER, pos1); - WI_F(output, pos, OPERATOR); + FLOAT4 out = CONVERT_FLOAT4(OPERATOR); + WI_F(output, pos, out); } __kernel void binary(__read_only image2d_t input0, __read_only image2d_t input1, __write_only image2d_t output, @@ -104,7 +107,8 @@ __kernel void binary(__read_only image2d_t input0, __read_only image2d_t input1, int2 pos1 = (int2)(nhwc1.w*whInput1.x+nhwc1.z, nhwc1.x*whInput1.y+nhwc1.y); FLOAT4 in0 = RI_F(input0, SAMPLER, pos); FLOAT4 in1 = RI_F(input1, SAMPLER, pos1); - WI_F(output, pos, OPERATOR); + FLOAT4 out = CONVERT_FLOAT4(OPERATOR); + WI_F(output, pos, out); } } @@ -118,7 +122,8 @@ __kernel void binary_value(__read_only image2d_t input0, __read_only image2d_t i const FLOAT input1Data = RI_F(input1, SAMPLER, (int2)(0, 0)).x; FLOAT4 in0 = RI_F(input0, SAMPLER, pos); FLOAT4 in1 = (FLOAT4)(input1Data); - WI_F(output, pos, OPERATOR); + FLOAT4 out = CONVERT_FLOAT4(OPERATOR); + WI_F(output, pos, out); } } diff --git a/source/backend/opencl/execution/cl/binary_buf.cl b/source/backend/opencl/execution/cl/binary_buf.cl index da879b9f..1bfec6c9 100644 --- a/source/backend/opencl/execution/cl/binary_buf.cl +++ b/source/backend/opencl/execution/cl/binary_buf.cl @@ -18,7 +18,7 @@ __kernel void binary_buf(__private int global_dim0, __private int global_dim1, if(isFull.y == 0) { in1 = (FLOAT4)(in1.x, in1.x, in1.x, in1.x); } - FLOAT4 out = OPERATOR; + FLOAT4 out = CONVERT_FLOAT4(OPERATOR); vstore4(out, offset, output); } } @@ -34,6 +34,7 @@ __kernel void prelu_buf(__private int global_dim0, __private int global_dim1, int offset = pos.x * (shape.y*shape.z) + pos.y; FLOAT4 in0 = vload4(offset, input0); FLOAT4 in1 = vload4(pos.x % shape.w, input1); - vstore4(OPERATOR, offset, output); + FLOAT4 out = CONVERT_FLOAT4(OPERATOR); + vstore4(out, offset, output); } } diff --git a/source/backend/opencl/execution/cl/opencl_program.cc b/source/backend/opencl/execution/cl/opencl_program.cc index c613e329..39c4ead7 100644 --- a/source/backend/opencl/execution/cl/opencl_program.cc +++ b/source/backend/opencl/execution/cl/opencl_program.cc @@ -22,7 +22,7 @@ extern const std::map> OpenCLProgramMap }, { "binary_buf", - { 
0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x4d,0x4e,0x4e,0x5f,0x53,0x55,0x50,0x50,0x4f,0x52,0x54,0x5f,0x46,0x50,0x31,0x36,0xa,0x23,0x70,0x72,0x61,0x67,0x6d,0x61,0x20,0x4f,0x50,0x45,0x4e,0x43,0x4c,0x20,0x45,0x58,0x54,0x45,0x4e,0x53,0x49,0x4f,0x4e,0x20,0x63,0x6c,0x5f,0x6b,0x68,0x72,0x5f,0x66,0x70,0x31,0x36,0x20,0x3a,0x20,0x65,0x6e,0x61,0x62,0x6c,0x65,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x62,0x69,0x6e,0x61,0x72,0x79,0x5f,0x62,0x75,0x66,0x28,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x69,0x6e,0x74,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x30,0x2c,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x69,0x6e,0x74,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x31,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2c,0x2f,0x2f,0x5b,0x4e,0x2c,0x48,0x2c,0x57,0x2c,0x43,0x34,0x5d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x32,0x20,0x69,0x73,0x46,0x75,0x6c,0x6c,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0x2f,0x2f,0x4e,0x43,0x34,0x2c,0x20,0x48,0x57,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x70,0x6f,0x73,0x2e,0x78,0x20,0x3c,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x30,0x20,0x26,0x26,0x20,0x70,0x6f,0x73,0x2e,0x79,0x20,0x3c,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x70,0x6f,0x73,0x2e,0x78,0x20,0x2a,0x20,0x28,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2a,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x20,0x2b,0x20,0x70,0x6f,0x73,0x2e,0x79,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x6f,0x66,0x66,0x73,0x65,0x74,0x2a,0x69,0x73,0x46,0x75,0x6c,0x6c,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x6f,0x66,0x66,0x73,0x65,0x74,0x2a,0x69,0x73,0x46,0x75,0x6c,0x6c,0x2e,0x79,0x2c,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x69,0x73,0x46,0x75,0x6c,0x6c,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x30,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x28,0x46,
0x4c,0x4f,0x41,0x54,0x34,0x29,0x28,0x69,0x6e,0x30,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x30,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x30,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x30,0x2e,0x78,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x69,0x73,0x46,0x75,0x6c,0x6c,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x30,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x28,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x28,0x69,0x6e,0x31,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x31,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x31,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x31,0x2e,0x78,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x4f,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x6f,0x75,0x74,0x2c,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x7d,0xa,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x70,0x72,0x65,0x6c,0x75,0x5f,0x62,0x75,0x66,0x28,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x69,0x6e,0x74,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x30,0x2c,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x69,0x6e,0x74,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x31,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2f,0x2f,0x5b,0x4e,0x2c,0x48,0x2c,0x57,0x2c,0x43,0x34,0x5d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0x2f,0x2f,0x4e,0x43,0x34,0x2c,0x20,0x48,0x57,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x70,0x6f,0x73,0x2e,0x78,0x20,0x3c,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x30,0x20,0x26,0x26,0x20,0x70,0x6f,0x73,0x2e,0x79,0x20,0x3c,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x70,0x6f,0x73,0x2e,0x78,0x20,0x2a,0x20,0x28,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2a,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x20,0x2b,0x20,0x70,0x6f,0x73,0x2e,0x79,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x6f,0x66,0x66,0x73,0x65,0x74,0x2c,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x29,0x3b,0xa,
0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x70,0x6f,0x73,0x2e,0x78,0x20,0x25,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x77,0x2c,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x4f,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x2c,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x7d,0xa, } + { 0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x4d,0x4e,0x4e,0x5f,0x53,0x55,0x50,0x50,0x4f,0x52,0x54,0x5f,0x46,0x50,0x31,0x36,0xa,0x23,0x70,0x72,0x61,0x67,0x6d,0x61,0x20,0x4f,0x50,0x45,0x4e,0x43,0x4c,0x20,0x45,0x58,0x54,0x45,0x4e,0x53,0x49,0x4f,0x4e,0x20,0x63,0x6c,0x5f,0x6b,0x68,0x72,0x5f,0x66,0x70,0x31,0x36,0x20,0x3a,0x20,0x65,0x6e,0x61,0x62,0x6c,0x65,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x62,0x69,0x6e,0x61,0x72,0x79,0x5f,0x62,0x75,0x66,0x28,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x69,0x6e,0x74,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x30,0x2c,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x69,0x6e,0x74,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x31,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2c,0x2f,0x2f,0x5b,0x4e,0x2c,0x48,0x2c,0x57,0x2c,0x43,0x34,0x5d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x32,0x20,0x69,0x73,0x46,0x75,0x6c,0x6c,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0x2f,0x2f,0x4e,0x43,0x34,0x2c,0x20,0x48,0x57,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x70,0x6f,0x73,0x2e,0x78,0x20,0x3c,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x30,0x20,0x26,0x26,0x20,0x70,0x6f,0x73,0x2e,0x79,0x20,0x3c,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x70,0x6f,0x73,0x2e,0x78,0x20,0x2a,0x20,0x28,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2a,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x20,0x2b,0x20,0x70,0x6f,0x73,0x2e,0x79,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x6f,0x66,0x66,0x73,0x65,0x74,0x2a,0x69,0x73,0x46,0x75,0x6c,0x6c,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x70,0x75,0x74
,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x6f,0x66,0x66,0x73,0x65,0x74,0x2a,0x69,0x73,0x46,0x75,0x6c,0x6c,0x2e,0x79,0x2c,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x69,0x73,0x46,0x75,0x6c,0x6c,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x30,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x28,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x28,0x69,0x6e,0x30,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x30,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x30,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x30,0x2e,0x78,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x69,0x73,0x46,0x75,0x6c,0x6c,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x30,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x28,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x28,0x69,0x6e,0x31,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x31,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x31,0x2e,0x78,0x2c,0x20,0x69,0x6e,0x31,0x2e,0x78,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x4f,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x6f,0x75,0x74,0x2c,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x7d,0xa,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x70,0x72,0x65,0x6c,0x75,0x5f,0x62,0x75,0x66,0x28,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x69,0x6e,0x74,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x30,0x2c,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x69,0x6e,0x74,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x31,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2f,0x2f,0x5b,0x4e,0x2c,0x48,0x2c,0x57,0x2c,0x43,0x34,0x5d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0x2f,0x2f,0x4e,0x43,0x34,0x2c,0x20,0x48,0x57,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x70,0x6f,0x73,0x2e,0x78,0x20,0x3c,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x30,0x20,0x26,0x26,0x20,0x70,0x6f
,0x73,0x2e,0x79,0x20,0x3c,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x64,0x69,0x6d,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x70,0x6f,0x73,0x2e,0x78,0x20,0x2a,0x20,0x28,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2a,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x20,0x2b,0x20,0x70,0x6f,0x73,0x2e,0x79,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x6f,0x66,0x66,0x73,0x65,0x74,0x2c,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x70,0x6f,0x73,0x2e,0x78,0x20,0x25,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x77,0x2c,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x4f,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x6f,0x75,0x74,0x2c,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x7d,0xa, } }, { "unary", @@ -42,7 +42,7 @@ extern const std::map> OpenCLProgramMap }, { "binary", - { 0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x4d,0x4e,0x4e,0x5f,0x53,0x55,0x50,0x50,0x4f,0x52,0x54,0x5f,0x46,0x50,0x31,0x36,0xa,0x23,0x70,0x72,0x61,0x67,0x6d,0x61,0x20,0x4f,0x50,0x45,0x4e,0x43,0x4c,0x20,0x45,0x58,0x54,0x45,0x4e,0x53,0x49,0x4f,0x4e,0x20,0x63,0x6c,0x5f,0x6b,0x68,0x72,0x5f,0x66,0x70,0x31,0x36,0x20,0x3a,0x20,0x65,0x6e,0x61,0x62,0x6c,0x65,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0x5f,0x5f,0x63,0x6f,0x6e,0x73,0x74,0x61,0x6e,0x74,0x20,0x73,0x61,0x6d,0x70,0x6c,0x65,0x72,0x5f,0x74,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x20,0x3d,0x20,0x43,0x4c,0x4b,0x5f,0x4e,0x4f,0x52,0x4d,0x41,0x4c,0x49,0x5a,0x45,0x44,0x5f,0x43,0x4f,0x4f,0x52,0x44,0x53,0x5f,0x46,0x41,0x4c,0x53,0x45,0x20,0x7c,0x20,0x43,0x4c,0x4b,0x5f,0x41,0x44,0x44,0x52,0x45,0x53,0x53,0x5f,0x43,0x4c,0x41,0x4d,0x50,0x20,0x7c,0x20,0x43,0x4c,0x4b,0x5f,0x46,0x49,0x4c,0x54,0x45,0x52,0x5f,0x4e,0x45,0x41,0x52,0x45,0x53,0x54,0x3b,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x62,0x69,0x6e,0x61,0x72,0x79,0x5f,0x73,0x61,0x6d,0x65,0x5f,0x63,0x68,0x61,0x6e,0x6e,0x65,0x6c,0x5f,0x62,0x72,0x6f,0x61,0x64,0x63,0x61,0x73,0x74,0x28,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x77,0x72,0x69,0x74,0x65,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69
,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x34,0x29,0x28,0x70,0x6f,0x73,0x2e,0x79,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x79,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x3e,0x3d,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x78,0x20,0x26,0x26,0x20,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x3e,0x3d,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x77,0x29,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0xa,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x2c,0x20,0x69,0x6e,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x30,0x2c,0x20,0x70,0x6f,0x73,0x31,0x3b,0xa,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x31,0x20,0x26,0x26,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x78,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0x65,0x6c,0x73,0x65,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0x20,0x2f,0x2f,0x20,0x54,0x65,0x6e,0x73,0x6f,0x72,0x20,0x30,0x20,0x77,0x69,0x64,0x74,0x68,0x20,0x6c,0x65,0x6e,0x67,0x74,0x68,0x20,0x31,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x78,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x20,0x21,0x3d,0x20,0x31,0x29,0x20,0x3f,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x20,0x3a,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0
x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0x65,0x6c,0x73,0x65,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0x20,0x2f,0x2f,0x20,0x54,0x65,0x6e,0x73,0x6f,0x72,0x20,0x30,0x20,0x68,0x65,0x69,0x67,0x68,0x74,0x20,0x6c,0x65,0x6e,0x67,0x74,0x68,0x20,0x31,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x20,0x21,0x3d,0x20,0x31,0x29,0x20,0x3f,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x20,0x3a,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x57,0x49,0x5f,0x46,0x28,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x70,0x6f,0x73,0x2c,0x20,0x4f,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x29,0x3b,0xa,0x7d,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x62,0x69,0x6e,0x61,0x72,0x79,0x5f,0x31,0x74,0x6f,0x4d,0x5f,0x63,0x68,0x61,0x6e,0x6e,0x65,0x6c,0x5f,0x62,0x72,0x6f,0x61,0x64,0x63,0x61,0x73,0x74,0x5f,0x6f,0x6e,0x5f,0x61,0x77,0x68,0x28,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x77,0x72,0x69,0x74,0x65,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x29,0x20,0x7b,0xa,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0
x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x34,0x29,0x28,0x70,0x6f,0x73,0x2e,0x79,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x79,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x3e,0x3d,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x78,0x20,0x26,0x26,0x20,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x3e,0x3d,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x77,0x29,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0xa,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x2c,0x20,0x69,0x6e,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x30,0x2c,0x20,0x70,0x6f,0x73,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x76,0x61,0x6c,0x75,0x65,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x28,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x28,0x76,0x61,0x6c,0x75,0x65,0x2e,0x78,0x29,0x3b,0xa,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x20,0x21,0x3d,0x20,0x31,0x20,0x26,0x26,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0x65,0x6c,0x73,0x65,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x31,0x20,0x26,0x26,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x20,0x21,0x3d,0x20,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0x65,0x6c,0x73,0x65,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x31,0x20,0x26,0x26,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0x65,0x6c,0x73,0x65,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70
,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x57,0x49,0x5f,0x46,0x28,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x70,0x6f,0x73,0x2c,0x20,0x4f,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x29,0x3b,0xa,0x7d,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x62,0x69,0x6e,0x61,0x72,0x79,0x5f,0x31,0x74,0x6f,0x4d,0x5f,0x63,0x68,0x61,0x6e,0x6e,0x65,0x6c,0x5f,0x62,0x72,0x6f,0x61,0x64,0x63,0x61,0x73,0x74,0x5f,0x6f,0x6e,0x5f,0x31,0x77,0x68,0x28,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x77,0x72,0x69,0x74,0x65,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x29,0x20,0x7b,0xa,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x34,0x29,0x28,0x70,0x6f,0x73,0x2e,0x79,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x79,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x3e,0x3d,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x78,0x20,0x26,0x26,0x20,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x3e,0x3d,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x77,0x29,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0xa,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x2c,0x20,0x69,0x6e,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x30,0x2c,0x20,0x70,0x6f,0x73,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x31,0x20,0x26,0x26,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x30,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x76,0x61,0x6c,0x75,0x65,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x28,0x46,0x4c,0x4f,0x41,0x54,0x3
4,0x29,0x28,0x76,0x61,0x6c,0x75,0x65,0x2e,0x78,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x65,0x6c,0x73,0x65,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0x20,0x2f,0x2f,0x20,0x54,0x65,0x6e,0x73,0x6f,0x72,0x20,0x30,0x20,0x77,0x69,0x64,0x74,0x68,0x20,0x6c,0x65,0x6e,0x67,0x74,0x68,0x20,0x31,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x30,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x76,0x61,0x6c,0x75,0x65,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x28,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x28,0x76,0x61,0x6c,0x75,0x65,0x2e,0x78,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x20,0x21,0x3d,0x20,0x31,0x29,0x20,0x3f,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x20,0x3a,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0x65,0x6c,0x73,0x65,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0x20,0x2f,0x2f,0x20,0x54,0x65,0x6e,0x73,0x6f,0x72,0x20,0x30,0x20,0x68,0x65,0x69,0x67,0x68,0x74,0x20,0x6c,0x65,0x6e,0x67,0x74,0x68,0x20,0x31,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x76,0x61,0x6c,0x75,0x65,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x28,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x28,0x76,0x61,0x6c,0x75,0x65,0x2e,0x78,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x20,0x21,0x3d,0x20,0x31,0x29,0x20,0x3f,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x2a,0x20,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x20,0x2b,0x20,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x2a,0x20,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2
e,0x79,0x20,0x2b,0x20,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x20,0x3a,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x2a,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x2a,0x20,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x20,0x2b,0x20,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x57,0x49,0x5f,0x46,0x28,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x70,0x6f,0x73,0x2c,0x20,0x4f,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x29,0x3b,0xa,0x7d,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x62,0x69,0x6e,0x61,0x72,0x79,0x28,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x77,0x72,0x69,0x74,0x65,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x69,0x6e,0x74,0x34,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x4e,0x48,0x57,0x43,0x53,0x74,0x65,0x70,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x34,0x29,0x28,0x70,0x6f,0x73,0x2e,0x79,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x79,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x3c,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x78,0x20,0x26,0x26,0x20,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x3c,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x77,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x31,0x20,0x3d,0x20,0x6e,0x68,0x77,0x63,0x20,0x2a,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x4e,0x48,0x57,0x43,0x53,0x74,0x65,0x70,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x31,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x31,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x31,0x2e,0x78,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x31,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x52,0x49,0x
5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x57,0x49,0x5f,0x46,0x28,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x70,0x6f,0x73,0x2c,0x20,0x4f,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x7d,0xa,0x7d,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x62,0x69,0x6e,0x61,0x72,0x79,0x5f,0x76,0x61,0x6c,0x75,0x65,0x28,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x77,0x72,0x69,0x74,0x65,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x69,0x6e,0x74,0x34,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x4e,0x48,0x57,0x43,0x53,0x74,0x65,0x70,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x34,0x29,0x28,0x70,0x6f,0x73,0x2e,0x79,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x79,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x3c,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x78,0x20,0x26,0x26,0x20,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x3c,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x77,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x31,0x20,0x3d,0x20,0x6e,0x68,0x77,0x63,0x20,0x2a,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x4e,0x48,0x57,0x43,0x53,0x74,0x65,0x70,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x31,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x31,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x31,0x2e,0x78,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x31,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x44,0x61,0x74,0x61,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x30,0x2c,0x20,0x30,0x29,0x29,0x2e,0x78,0x3b,0xa,0x20,0
x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x28,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x44,0x61,0x74,0x61,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x57,0x49,0x5f,0x46,0x28,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x70,0x6f,0x73,0x2c,0x20,0x4f,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x7d,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x69,0x6d,0x61,0x67,0x65,0x43,0x6f,0x70,0x79,0x28,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x2c,0x20,0x5f,0x5f,0x77,0x72,0x69,0x74,0x65,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x32,0x20,0x64,0x69,0x6d,0x20,0x3d,0x20,0x67,0x65,0x74,0x5f,0x69,0x6d,0x61,0x67,0x65,0x5f,0x64,0x69,0x6d,0x28,0x69,0x6e,0x70,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x70,0x6f,0x73,0x2e,0x78,0x20,0x3e,0x3d,0x20,0x64,0x69,0x6d,0x2e,0x78,0x20,0x26,0x26,0x20,0x70,0x6f,0x73,0x2e,0x79,0x20,0x3e,0x3d,0x20,0x64,0x69,0x6d,0x2e,0x79,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x57,0x49,0x5f,0x46,0x28,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x70,0x6f,0x73,0x2c,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x29,0x29,0x3b,0xa,0x7d,0xa, } + { 
0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x4d,0x4e,0x4e,0x5f,0x53,0x55,0x50,0x50,0x4f,0x52,0x54,0x5f,0x46,0x50,0x31,0x36,0xa,0x23,0x70,0x72,0x61,0x67,0x6d,0x61,0x20,0x4f,0x50,0x45,0x4e,0x43,0x4c,0x20,0x45,0x58,0x54,0x45,0x4e,0x53,0x49,0x4f,0x4e,0x20,0x63,0x6c,0x5f,0x6b,0x68,0x72,0x5f,0x66,0x70,0x31,0x36,0x20,0x3a,0x20,0x65,0x6e,0x61,0x62,0x6c,0x65,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0x5f,0x5f,0x63,0x6f,0x6e,0x73,0x74,0x61,0x6e,0x74,0x20,0x73,0x61,0x6d,0x70,0x6c,0x65,0x72,0x5f,0x74,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x20,0x3d,0x20,0x43,0x4c,0x4b,0x5f,0x4e,0x4f,0x52,0x4d,0x41,0x4c,0x49,0x5a,0x45,0x44,0x5f,0x43,0x4f,0x4f,0x52,0x44,0x53,0x5f,0x46,0x41,0x4c,0x53,0x45,0x20,0x7c,0x20,0x43,0x4c,0x4b,0x5f,0x41,0x44,0x44,0x52,0x45,0x53,0x53,0x5f,0x43,0x4c,0x41,0x4d,0x50,0x20,0x7c,0x20,0x43,0x4c,0x4b,0x5f,0x46,0x49,0x4c,0x54,0x45,0x52,0x5f,0x4e,0x45,0x41,0x52,0x45,0x53,0x54,0x3b,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x62,0x69,0x6e,0x61,0x72,0x79,0x5f,0x73,0x61,0x6d,0x65,0x5f,0x63,0x68,0x61,0x6e,0x6e,0x65,0x6c,0x5f,0x62,0x72,0x6f,0x61,0x64,0x63,0x61,0x73,0x74,0x28,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x77,0x72,0x69,0x74,0x65,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x34,0x29,0x28,0x70,0x6f,0x73,0x2e,0x79,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x79,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x3e,0x3d,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x78,0x20,0x26,0x26,0x20,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x3e,0x3d,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x77,0x29,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0xa,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x2c,0x20,0x69,0x6e,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x30,0x2c,0x20,0x70,0x6f,0x73,0x31,0x3b,0xa,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x31,0x20,0x26,0x26,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,
0x63,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x78,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0x65,0x6c,0x73,0x65,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0x20,0x2f,0x2f,0x20,0x54,0x65,0x6e,0x73,0x6f,0x72,0x20,0x30,0x20,0x77,0x69,0x64,0x74,0x68,0x20,0x6c,0x65,0x6e,0x67,0x74,0x68,0x20,0x31,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x78,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x20,0x21,0x3d,0x20,0x31,0x29,0x20,0x3f,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x20,0x3a,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0x65,0x6c,0x73,0x65,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0x20,0x2f,0x2f,0x20,0x54,0x65,0x6e,0x73,0x6f,0x72,0x20,0x30,0x20,0x68,0x65,0x69,0x67,0x68,0x74,0x20,0x6c,0x65,0x6e,0x67,0x74,0x68,0x20,0x31,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x20,0x21,0x3d,0x20,0x31,0x29,0x20,0x3f,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4
f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x20,0x3a,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x4f,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x57,0x49,0x5f,0x46,0x28,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x70,0x6f,0x73,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x7d,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x62,0x69,0x6e,0x61,0x72,0x79,0x5f,0x31,0x74,0x6f,0x4d,0x5f,0x63,0x68,0x61,0x6e,0x6e,0x65,0x6c,0x5f,0x62,0x72,0x6f,0x61,0x64,0x63,0x61,0x73,0x74,0x5f,0x6f,0x6e,0x5f,0x61,0x77,0x68,0x28,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x77,0x72,0x69,0x74,0x65,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x29,0x20,0x7b,0xa,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x34,0x29,0x28,0x70,0x6f,0x73,0x2e,0x79,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x79,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x3e,0x3d,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x78,0x20,0x26,0x26,0x20,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x3e,0x3d,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x77,0x29,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0xa,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x2c,0x20,0x69,0x6e,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x30,0x2c,0x20,0x70,0x6f,0x73,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x
46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x76,0x61,0x6c,0x75,0x65,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x28,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x28,0x76,0x61,0x6c,0x75,0x65,0x2e,0x78,0x29,0x3b,0xa,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x20,0x21,0x3d,0x20,0x31,0x20,0x26,0x26,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0x65,0x6c,0x73,0x65,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x31,0x20,0x26,0x26,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x20,0x21,0x3d,0x20,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0x65,0x6c,0x73,0x65,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x31,0x20,0x26,0x26,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0x65,0x6c,0x73,0x65,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x4f,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x57,0x49,0x5f,0x46,0x28,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x70,0x6f,0x73,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x7d,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x62,0x69,0x6e,0x61,0x72,0x79,0x5f,0x31,0x74,0x6f,0x4d,0x5f,0x63,0x68,0x61,0x6e,0x6e,0x65,0x6c,0x5f,0x62,0x72,0x6f,0x61,0x64,0x63,0x61,0x73,0x74,0x5f,0x6f,0x6e,0x5f,0x31,0x77,0x68,0x28,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x77,0x72,0x69,0x74
,0x65,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x29,0x20,0x7b,0xa,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x34,0x29,0x28,0x70,0x6f,0x73,0x2e,0x79,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x79,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x3e,0x3d,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x78,0x20,0x26,0x26,0x20,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x3e,0x3d,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x77,0x29,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0xa,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x2c,0x20,0x69,0x6e,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x30,0x2c,0x20,0x70,0x6f,0x73,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x31,0x20,0x26,0x26,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x30,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x76,0x61,0x6c,0x75,0x65,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x28,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x28,0x76,0x61,0x6c,0x75,0x65,0x2e,0x78,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x65,0x6c,0x73,0x65,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x78,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0x20,0x2f,0x2f,0x20,0x54,0x65,0x6e,0x73,0x6f,0x72,0x20,0x30,0x20,0x77,0x69,0x64,0x74,0x68,0x20,0x6c,0x65,0x6e,0x67,0x74,0x68,0x20,0x31,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x30,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x76,0x61,0x6c,0x75,0x65,0x20,0x3d,0x20,0x52,0x49,0x
5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x28,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x28,0x76,0x61,0x6c,0x75,0x65,0x2e,0x78,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x20,0x21,0x3d,0x20,0x31,0x29,0x20,0x3f,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x20,0x3a,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x2a,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0x20,0x65,0x6c,0x73,0x65,0x20,0x69,0x66,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x30,0x2e,0x79,0x20,0x3d,0x3d,0x20,0x31,0x29,0x20,0x7b,0x20,0x2f,0x2f,0x20,0x54,0x65,0x6e,0x73,0x6f,0x72,0x20,0x30,0x20,0x68,0x65,0x69,0x67,0x68,0x74,0x20,0x6c,0x65,0x6e,0x67,0x74,0x68,0x20,0x31,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x30,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x76,0x61,0x6c,0x75,0x65,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x28,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x28,0x76,0x61,0x6c,0x75,0x65,0x2e,0x78,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x20,0x21,0x3d,0x20,0x31,0x29,0x20,0x3f,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x2a,0x20,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x78,0x20,0x2b,0x20,0x6e,0x68,0x77,0x63,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x2a,0x20,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x20,0x2b,0x20,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x20,0x3a,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x2a,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x2c,0x20,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x2a,0x20,0x77,0x68,0x4f,0x75,0x74,0x70,0x75,0x74,0x2e,0x79,0x20,0x2b,0x20,0x6e,0x68,0x77,0x63,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x4f,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x57,0x49,0x5f,0x46,0x28,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x70,0x6f,0x73,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x7d,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,
0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x62,0x69,0x6e,0x61,0x72,0x79,0x28,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x77,0x72,0x69,0x74,0x65,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x69,0x6e,0x74,0x34,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x4e,0x48,0x57,0x43,0x53,0x74,0x65,0x70,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x34,0x29,0x28,0x70,0x6f,0x73,0x2e,0x79,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x79,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x3c,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x78,0x20,0x26,0x26,0x20,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x3c,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x77,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x31,0x20,0x3d,0x20,0x6e,0x68,0x77,0x63,0x20,0x2a,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x4e,0x48,0x57,0x43,0x53,0x74,0x65,0x70,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x31,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x31,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x31,0x2e,0x78,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x31,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x4f,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x57,0x49,0x5f,0x46,0x28,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x70,0x6f,0x73,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0
x20,0x7d,0xa,0x7d,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x62,0x69,0x6e,0x61,0x72,0x79,0x5f,0x76,0x61,0x6c,0x75,0x65,0x28,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x5f,0x5f,0x77,0x72,0x69,0x74,0x65,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x73,0x68,0x61,0x70,0x65,0x2c,0x20,0x69,0x6e,0x74,0x32,0x20,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x69,0x6e,0x74,0x34,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x4e,0x48,0x57,0x43,0x53,0x74,0x65,0x70,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x34,0x29,0x28,0x70,0x6f,0x73,0x2e,0x79,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x79,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x79,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x25,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x2c,0x20,0x70,0x6f,0x73,0x2e,0x78,0x2f,0x73,0x68,0x61,0x70,0x65,0x2e,0x7a,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x6e,0x68,0x77,0x63,0x2e,0x78,0x20,0x3c,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x78,0x20,0x26,0x26,0x20,0x6e,0x68,0x77,0x63,0x2e,0x77,0x20,0x3c,0x20,0x73,0x68,0x61,0x70,0x65,0x2e,0x77,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x34,0x20,0x6e,0x68,0x77,0x63,0x31,0x20,0x3d,0x20,0x6e,0x68,0x77,0x63,0x20,0x2a,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x4e,0x48,0x57,0x43,0x53,0x74,0x65,0x70,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x31,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x6e,0x68,0x77,0x63,0x31,0x2e,0x77,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x78,0x2b,0x6e,0x68,0x77,0x63,0x31,0x2e,0x7a,0x2c,0x20,0x6e,0x68,0x77,0x63,0x31,0x2e,0x78,0x2a,0x77,0x68,0x49,0x6e,0x70,0x75,0x74,0x31,0x2e,0x79,0x2b,0x6e,0x68,0x77,0x63,0x31,0x2e,0x79,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x44,0x61,0x74,0x61,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x30,0x2c,0x20,0x30,0x29,0x29,0x2e,0x78,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x30,0x20,0x3d,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x69,0x6e,0x31,0x20,0x3d,0x20,0x28,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x44,0x61,0x74,0x61,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x4f
,0x50,0x45,0x52,0x41,0x54,0x4f,0x52,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x57,0x49,0x5f,0x46,0x28,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x70,0x6f,0x73,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x7d,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x69,0x6d,0x61,0x67,0x65,0x43,0x6f,0x70,0x79,0x28,0x5f,0x5f,0x72,0x65,0x61,0x64,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x69,0x6e,0x70,0x75,0x74,0x2c,0x20,0x5f,0x5f,0x77,0x72,0x69,0x74,0x65,0x5f,0x6f,0x6e,0x6c,0x79,0x20,0x69,0x6d,0x61,0x67,0x65,0x32,0x64,0x5f,0x74,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x32,0x20,0x70,0x6f,0x73,0x20,0x3d,0x20,0x28,0x69,0x6e,0x74,0x32,0x29,0x28,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x2c,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x32,0x20,0x64,0x69,0x6d,0x20,0x3d,0x20,0x67,0x65,0x74,0x5f,0x69,0x6d,0x61,0x67,0x65,0x5f,0x64,0x69,0x6d,0x28,0x69,0x6e,0x70,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x70,0x6f,0x73,0x2e,0x78,0x20,0x3e,0x3d,0x20,0x64,0x69,0x6d,0x2e,0x78,0x20,0x26,0x26,0x20,0x70,0x6f,0x73,0x2e,0x79,0x20,0x3e,0x3d,0x20,0x64,0x69,0x6d,0x2e,0x79,0x29,0x20,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x57,0x49,0x5f,0x46,0x28,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x70,0x6f,0x73,0x2c,0x20,0x52,0x49,0x5f,0x46,0x28,0x69,0x6e,0x70,0x75,0x74,0x2c,0x20,0x53,0x41,0x4d,0x50,0x4c,0x45,0x52,0x2c,0x20,0x70,0x6f,0x73,0x29,0x29,0x3b,0xa,0x7d,0xa, } }, { "scale", diff --git a/source/backend/opencl/execution/image/EltwiseExecution.cpp b/source/backend/opencl/execution/image/EltwiseExecution.cpp index 8f3454ef..3ed45166 100644 --- a/source/backend/opencl/execution/image/EltwiseExecution.cpp +++ b/source/backend/opencl/execution/image/EltwiseExecution.cpp @@ -210,22 +210,42 @@ public: MNN_ASSERT(inputs.size() > 1); switch (op->main_as_BinaryOp()->opType()) { + case BinaryOpOperation_MUL: + return new EltwiseExecution(inputs, "in0*in1", op, backend); case BinaryOpOperation_ADD: return new EltwiseExecution(inputs, "in0+in1", op, backend); case BinaryOpOperation_SUB: return new EltwiseExecution(inputs, "in0-in1", op, backend); - case BinaryOpOperation_MUL: - return new EltwiseExecution(inputs, "in0*in1", op, backend); - case BinaryOpOperation_POW: - return new EltwiseExecution(inputs, "pow(in0,in1)", op, backend); - case BinaryOpOperation_DIV: - return new EltwiseExecution(inputs, "sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001))", op, backend); - case BinaryOpOperation_MAXIMUM: - return new EltwiseExecution(inputs, "in0>in1?in0:in1", op, backend); - case BinaryOpOperation_MINIMUM: - return new EltwiseExecution(inputs, "in0>in1?in1:in0", op, backend); case BinaryOpOperation_REALDIV: return new EltwiseExecution(inputs, "sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001))", op, backend); + case BinaryOpOperation_MINIMUM: + return new EltwiseExecution(inputs, "in0>in1?in1:in0", op, backend); + case BinaryOpOperation_MAXIMUM: + return new EltwiseExecution(inputs, "in0>in1?in0:in1", op, backend); + case BinaryOpOperation_GREATER: + return new EltwiseExecution(inputs, 
"convert_float4(isgreater(in0,in1))", op, backend); + case BinaryOpOperation_LESS: + return new EltwiseExecution(inputs, "convert_float4(isless(in0,in1))", op, backend); + case BinaryOpOperation_LESS_EQUAL: + return new EltwiseExecution(inputs, "convert_float4(islessequal(in0,in1))", op, backend); + case BinaryOpOperation_GREATER_EQUAL: + return new EltwiseExecution(inputs, "convert_float4(isgreaterequal(in0,in1))", op, backend); + case BinaryOpOperation_EQUAL: + return new EltwiseExecution(inputs, "convert_float4(isequal(in0,in1))", op, backend); + case BinaryOpOperation_FLOORDIV: + return new EltwiseExecution(inputs, "floor(sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001)))", op, backend); + case BinaryOpOperation_FLOORMOD: + return new EltwiseExecution(inputs, "in0-floor(sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001)))*in1", op, backend); + case BinaryOpOperation_POW: + return new EltwiseExecution(inputs, "pow(in0,in1)", op, backend); + case BinaryOpOperation_SquaredDifference: + return new EltwiseExecution(inputs, "(in0-in1)*(in0-in1)", op, backend); + case BinaryOpOperation_ATAN2: + return new EltwiseExecution(inputs, "atan(sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001)))", op, backend); + case BinaryOpOperation_NOTEQUAL: + return new EltwiseExecution(inputs, "convert_float4(isnotequal(in0,in1))", op, backend); + case BinaryOpOperation_MOD: + return new EltwiseExecution(inputs, "in0-sign(in1)*in0/(fabs(in1)>(FLOAT4)((FLOAT)0.0000001)?fabs(in1):(FLOAT4)((FLOAT)0.0000001))", op, backend); default: break; } diff --git a/source/backend/opencl/execution/image/UnaryExecution.cpp b/source/backend/opencl/execution/image/UnaryExecution.cpp index 248ed33a..a442528c 100644 --- a/source/backend/opencl/execution/image/UnaryExecution.cpp +++ b/source/backend/opencl/execution/image/UnaryExecution.cpp @@ -88,53 +88,71 @@ public: const MNN::Op* op, Backend* backend) const override { if (op->type() == OpType_UnaryOp) { switch (op->main_as_UnaryOp()->opType()) { + case UnaryOpOperation_ABS: + return new UnaryExecution("fabs(convert_float4(in))", backend); case UnaryOpOperation_SQUARE: return new UnaryExecution("in*in", backend); - case UnaryOpOperation_ERF: + case UnaryOpOperation_RSQRT: + return new UnaryExecution("rsqrt(convert_float4(in))", backend); + case UnaryOpOperation_NEG: + return new UnaryExecution("-(in)", backend); + case UnaryOpOperation_EXP: + return new UnaryExecution("exp(convert_float4(in))", backend); + case UnaryOpOperation_COS: + return new UnaryExecution("cos(convert_float4(in))", backend); + case UnaryOpOperation_SIN: + return new UnaryExecution("sin(convert_float4(in))", backend); + case UnaryOpOperation_TAN: + return new UnaryExecution("tan(convert_float4(in))", backend); + case UnaryOpOperation_ATAN: + return new UnaryExecution("atan(convert_float4(in))", backend); + case UnaryOpOperation_SQRT: + return new UnaryExecution("sqrt(convert_float4(in))", backend); + case UnaryOpOperation_CEIL: + return new UnaryExecution("ceil(convert_float4(in))", backend); + case UnaryOpOperation_RECIPROCAL: + return new UnaryExecution("native_recip(convert_float4(in))", backend); + case UnaryOpOperation_LOG1P: + return new UnaryExecution("log1p(convert_float4(in))", backend); + case UnaryOpOperation_LOG: + return new UnaryExecution("native_log(convert_float4(in)>(float4)(0.0000001)?convert_float4(in):(float4)(0.0000001))", backend); + case UnaryOpOperation_FLOOR: + return 
new UnaryExecution("floor(convert_float4(in))", backend); + case UnaryOpOperation_BNLL: + return new UnaryExecution("in>(FLOAT4)((FLOAT)0)?(in+native_log(exp(convert_float4(-(in)))+(float4)(1.0))):(native_log(exp(convert_float4(in))+(float4)(1.0)))", backend); + case UnaryOpOperation_ACOSH: + return new UnaryExecution("acosh(convert_float4(in))", backend); + case UnaryOpOperation_SINH: + return new UnaryExecution("sinh(convert_float4(in))", backend); + case UnaryOpOperation_ASINH: + return new UnaryExecution("asinh(convert_float4(in))", backend); + case UnaryOpOperation_ATANH: + return new UnaryExecution("atanh(convert_float4(in))", backend); + case UnaryOpOperation_SIGN: + return new UnaryExecution("sign(convert_float4(in))", backend); + case UnaryOpOperation_ROUND: + return new UnaryExecution("round(convert_float4(in))", backend); + case UnaryOpOperation_COSH: + return new UnaryExecution("cosh(convert_float4(in))", backend); + case UnaryOpOperation_ERF: return new UnaryExecution("erf(convert_float4(in))", backend); case UnaryOpOperation_ERFC: return new UnaryExecution("erfc(convert_float4(in))", backend); - case UnaryOpOperation_SQRT: - return new UnaryExecution("sqrt(convert_float4(in))", backend); - case UnaryOpOperation_RSQRT: - return new UnaryExecution("rsqrt(convert_float4(in))", backend); - case UnaryOpOperation_ABS: - return new UnaryExecution("fabs(convert_float4(in))", backend); - case UnaryOpOperation_SIN: - return new UnaryExecution("sin(convert_float4(in))", backend); - case UnaryOpOperation_COS: - return new UnaryExecution("cos(convert_float4(in))", backend); - case UnaryOpOperation_SIGN: - return new UnaryExecution("sign(convert_float4(in))", backend); - case UnaryOpOperation_EXP: - return new UnaryExecution("exp(convert_float4(in))", backend); - case UnaryOpOperation_NEG: - return new UnaryExecution("-(in)", backend); - case UnaryOpOperation_TAN: - return new UnaryExecution("tan(convert_float4(in))", backend); - case UnaryOpOperation_CEIL: - return new UnaryExecution("ceil(convert_float4(in))", backend); - case UnaryOpOperation_LOG1P: - return new UnaryExecution("log1p(convert_float4(in))", backend); - case UnaryOpOperation_FLOOR: - return new UnaryExecution("floor(convert_float4(in))", backend); - case UnaryOpOperation_ROUND: - return new UnaryExecution("round(convert_float4(in))", backend); + case UnaryOpOperation_EXPM1: + return new UnaryExecution("expm1(convert_float4(in))", backend); case UnaryOpOperation_SIGMOID: return new UnaryExecution("native_recip((float4)1+native_exp(convert_float4(-in)))", backend); case UnaryOpOperation_TANH: return new UnaryExecution("tanh(convert_float4(in))", backend); - case UnaryOpOperation_RECIPROCAL: - return new UnaryExecution("native_recip(convert_float4(in))", backend); - case UnaryOpOperation_LOG: - return new UnaryExecution("native_log(convert_float4(in+(FLOAT4)((FLOAT)0.0000001)))", backend); + case UnaryOpOperation_HARDSWISH: + return new UnaryExecution("in>(FLOAT4)((FLOAT)-3)?(in<(FLOAT4)((FLOAT)3)?((convert_float4(in)*(convert_float4(in)+(float4)3.0))/(float4)6.0):convert_float4(in)):(float4)(0.0)", backend); default: break; } return nullptr; } if (op->type() == OpType_Sigmoid) { - return new UnaryExecution("native_recip((float4)(1)+native_exp(convert_float4(-in)))", backend); + return new UnaryExecution("native_recip((float4)(1.0)+native_exp(convert_float4(-(in))))", backend); } if (op->type() == OpType_TanH) { return new UnaryExecution("tanh(convert_float4(in))", backend); diff --git a/source/backend/opengl/GLBackend.cpp 
b/source/backend/opengl/GLBackend.cpp index b408bae9..3d840460 100644 --- a/source/backend/opengl/GLBackend.cpp +++ b/source/backend/opengl/GLBackend.cpp @@ -424,7 +424,7 @@ bool GLBackend::isCreateError() const { return mIsCreateError; } -Backend* GLRuntime::onCreate() const { +Backend* GLRuntime::onCreate(const BackendConfig* config) const { BackendConfig::PrecisionMode precision = BackendConfig::Precision_Normal; BackendConfig::PowerMode power = BackendConfig::Power_Normal; if (nullptr != mInfo.user) { @@ -443,7 +443,7 @@ class GLRuntimeCreator : public RuntimeCreator { public: virtual Runtime *onCreate(const Backend::Info &info) const override { auto rt = new GLRuntime(info); - auto bn = (GLBackend*)rt->onCreate(); + auto bn = (GLBackend*)(rt->onCreate(nullptr)); if (bn->isCreateError()) { delete bn; delete rt; diff --git a/source/backend/opengl/GLBackend.hpp b/source/backend/opengl/GLBackend.hpp index e3270184..feea2497 100644 --- a/source/backend/opengl/GLBackend.hpp +++ b/source/backend/opengl/GLBackend.hpp @@ -35,7 +35,7 @@ public: @brief create backend @return created backend */ - virtual Backend* onCreate() const override; + virtual Backend* onCreate(const BackendConfig* config) const override; /** @brief clear unuseful resource diff --git a/source/backend/tensorrt/backend/TRTBackend.cpp b/source/backend/tensorrt/backend/TRTBackend.cpp index 135e62d1..94a5af25 100755 --- a/source/backend/tensorrt/backend/TRTBackend.cpp +++ b/source/backend/tensorrt/backend/TRTBackend.cpp @@ -13,6 +13,7 @@ #include #include #include +// #define MNN_OPEN_TIME_TRACE #include #include #include @@ -53,7 +54,7 @@ TRTRuntime::TRTRuntime(const Backend::Info& info) { TRTRuntime::~TRTRuntime() { } -Backend* TRTRuntime::onCreate() const { +Backend* TRTRuntime::onCreate(const BackendConfig* config) const { return new TRTBackend(this); } @@ -209,7 +210,6 @@ bool TRTBackend::onAcquireBuffer(const Tensor* tensor, StorageType storageType) auto type = tensor->getType(); auto trtType = nvinfer1::DataType::kFLOAT; dims.nbDims = shape.size(); - ::memcpy(dims.d, shape.data(), dims.nbDims * sizeof(int32_t)); auto input = mNetwork->addInput(name, trtType, dims); mTensorMaps[tensor].first = input; @@ -231,6 +231,23 @@ bool TRTBackend::onClearBuffer() { return true; } +template +void NHWC2NCHW(const T* source, T* dest, int b, int c, int area) { + int sourceBatchsize = c * area; + int destBatchSize = sourceBatchsize; + for (int bi = 0; bi < b; ++bi) { + auto srcBatch = source + bi * sourceBatchsize; + auto dstBatch = dest + bi * destBatchSize; + for (int i = 0; i < area; ++i) { + auto srcArea = srcBatch + i * c; + auto dstArea = dstBatch + i; + for (int ci = 0; ci < c; ++ci) { + dstArea[ci * area] = srcArea[ci]; + } + } + } +} + void TRTBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const { bool isConst = (TensorUtils::getDescribe(srcTensor)->usage == Tensor::InsideDescribe::Usage::CONSTANT || TensorUtils::getDescribe(dstTensor)->usage == Tensor::InsideDescribe::Usage::CONSTANT); @@ -248,6 +265,7 @@ void TRTBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) auto totalSize = srcTensor->elementSize(); std::shared_ptr common(new ConvolutionCommon::Int8Common); common->weightFloat.reset(totalSize); + // trtType = nvinfer1::DataType::kFLOAT; auto dstFloat = common->weightFloat.get(); if (type == halide_type_of()) { auto src = srcTensor->host(); @@ -266,7 +284,6 @@ void TRTBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) } } TRTWeight weight{trtType, 
static_cast(common->weightFloat.get()), static_cast(totalSize)}; - auto const_layer = mNetwork->addConstant(dims, weight.get()); mTensorMaps[dstTensor].first = const_layer->getOutput(0); pushCache(common); @@ -284,15 +301,49 @@ void TRTBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) printf("TRTBackend onCopyBuffer in %d, outIdx:%d\n", index_++, output_index); #endif + AUTOTIME; auto isInputCopy = TensorUtils::getDescribe(dstTensor)->usage == Tensor::InsideDescribe::Usage::INPUT; if (isInputCopy) { - shared_ptr tmpTensor(new Tensor(dstTensor, Tensor::DimensionType::CAFFE, true)); // nchw - tensorConvert(srcTensor, tmpTensor.get()); + MNN_DATA_FORMAT data_format = TensorUtils::getDescribe(srcTensor)->dimensionFormat; auto inputIndex = mContext->getEngine().getBindingIndex(mInputs[dstTensor].first.c_str()); - auto status = cudaMemcpy(mInOutbuffers[inputIndex], tmpTensor->host(), tmpTensor->size(), cudaMemcpyHostToDevice); - MNN_ASSERT(0 == status); + if(data_format == Tensor::DimensionType::CAFFE){ + auto type = srcTensor->getType(); + if (type == halide_type_of()) { + auto totalSize = srcTensor->elementSize(); + for (int v = 0; v < totalSize; ++v) { + srcTensor->host()[v] = float(srcTensor->host()[v]); + } + }else if(type == halide_type_of()){ + auto totalSize = srcTensor->elementSize(); + for (int v = 0; v < totalSize; ++v) { + srcTensor->host()[v] = float(srcTensor->host()[v]); + } + } + auto status = cudaMemcpy(mInOutbuffers[inputIndex], srcTensor->host(), srcTensor->size(), cudaMemcpyHostToDevice); + MNN_ASSERT(0 == status); + }else{ + int area = dstTensor->height() * dstTensor->width(); + int b = dstTensor->batch(); + int c = dstTensor->channel(); + shared_ptr tmpTensor(new Tensor(dstTensor, Tensor::DimensionType::CAFFE, true)); // nchw + NHWC2NCHW(tmpTensor->host(), srcTensor->host(), b, c, area); + auto type = tmpTensor->getType(); + if (type == halide_type_of()) { + auto totalSize = tmpTensor->elementSize(); + for (int v = 0; v < totalSize; ++v) { + tmpTensor->host()[v] = float(tmpTensor->host()[v]); + } + }else if(type == halide_type_of()){ + auto totalSize = tmpTensor->elementSize(); + for (int v = 0; v < totalSize; ++v) { + tmpTensor->host()[v] = float(tmpTensor->host()[v]); + } + } + auto status = cudaMemcpy(mInOutbuffers[inputIndex], tmpTensor->host(), tmpTensor->size(), cudaMemcpyHostToDevice); + MNN_ASSERT(0 == status); + } } else { - shared_ptr tmpTensor(new Tensor(srcTensor, srcTensor->getDimensionType(), true)); // nchw + shared_ptr tmpTensor(new Tensor(srcTensor, srcTensor->getDimensionType(), true)); MNN_ASSERT(dstTensor->host() != nullptr); auto outputIndex = mContext->getEngine().getBindingIndex(mOutputs[srcTensor].first.c_str()); auto status = cudaMemcpy(tmpTensor->host(), mInOutbuffers[outputIndex], tmpTensor->size(), cudaMemcpyDeviceToHost); @@ -336,6 +387,7 @@ void TRTBackend::onResizeEnd() { } auto cudaEngine = mBuilder->buildCudaEngine(*mNetwork); MNN_ASSERT(cudaEngine != nullptr); + IHostMemory* model = cudaEngine->serialize(); if (mEngine == nullptr) { @@ -432,3 +484,4 @@ static bool gResistor = []() { return false; }(); } // namespace MNN + diff --git a/source/backend/tensorrt/backend/TRTBackend.hpp b/source/backend/tensorrt/backend/TRTBackend.hpp index 359f9438..b90711f6 100644 --- a/source/backend/tensorrt/backend/TRTBackend.hpp +++ b/source/backend/tensorrt/backend/TRTBackend.hpp @@ -33,7 +33,7 @@ public: TRTRuntime(const Backend::Info& info); virtual ~TRTRuntime(); - virtual Backend* onCreate() const override; + virtual Backend* 
onCreate(const BackendConfig* config) const override; virtual void onGabageCollect(int level) override; // If buffer is not nullptr, try copy cache, else delete cache virtual bool onSetCache(const void* buffer, size_t size) override { diff --git a/source/backend/tensorrt/execution/TRTBatchMatMul.cpp b/source/backend/tensorrt/execution/TRTBatchMatMul.cpp index 6a2188b8..5cbeb4df 100755 --- a/source/backend/tensorrt/execution/TRTBatchMatMul.cpp +++ b/source/backend/tensorrt/execution/TRTBatchMatMul.cpp @@ -2,7 +2,7 @@ // TRTBatchMatMul.cpp // MNN // -// Created by MNN on 2019/09/11. +// Created by MNN on 2021/02/28. // Copyright © 2018, Alibaba Group Holding Limited // @@ -33,23 +33,17 @@ std::vector TRTBatchMatMul::onEncode(const std::vector &xO auto param = mOp->main_as_BatchMatMulParam(); MNN_ASSERT(mInputs.size() == 2); bool isConst0 = TensorUtils::getDescribe(mInputs[0])->usage == Tensor::InsideDescribe::Usage::CONSTANT; - bool isConst1 = TensorUtils::getDescribe(mInputs[0])->usage == Tensor::InsideDescribe::Usage::CONSTANT; + bool isConst1 = TensorUtils::getDescribe(mInputs[1])->usage == Tensor::InsideDescribe::Usage::CONSTANT; auto dimSize0 = mInputs[0]->dimensions(); auto dimSize1 = mInputs[1]->dimensions(); -//hangxing TODO: not same dimension, add addShuffle to broadcast dim - // MNN_ASSERT(dimSize0 == dimSize1); - // for (size_t i = 0; i < dimSize0; i++){ - // MNN_PRINT("dim0 : %d , dim1 : %d \n", mInputs[0]->length(i), mInputs[1]->length(i)); - // MNN_ASSERT(mInputs[0]->length(i) == mInputs[1]->length(i)); - // } - auto transpose_a = transposeFormat(xOp[0], param->adjX()); auto transpose_b = transposeFormat(xOp[1], param->adjY()); auto matmul_layer = mTrtBackend->getNetwork()->addMatrixMultiply(*xOp[0], transpose_a, *xOp[1], transpose_b); return {matmul_layer->getOutput(0)}; + } TRTCreatorRegister> __batch_matmul_op(OpType_BatchMatMul); diff --git a/source/backend/tensorrt/execution/TRTBatchMatMul.hpp b/source/backend/tensorrt/execution/TRTBatchMatMul.hpp index 31b8f622..f1a82855 100755 --- a/source/backend/tensorrt/execution/TRTBatchMatMul.hpp +++ b/source/backend/tensorrt/execution/TRTBatchMatMul.hpp @@ -2,7 +2,7 @@ // TRTBatchMatMul.hpp // MNN // -// Created by MNN on 2019/09/11. +// Created by MNN on 2021/02/28. 
// Copyright © 2018, Alibaba Group Holding Limited // diff --git a/source/backend/tensorrt/execution/TRTCast.cpp b/source/backend/tensorrt/execution/TRTCast.cpp index a96f0a95..5334c03e 100755 --- a/source/backend/tensorrt/execution/TRTCast.cpp +++ b/source/backend/tensorrt/execution/TRTCast.cpp @@ -32,7 +32,7 @@ std::vector TRTCast::onEncode(const std::vector &xOp) { onehotp->outerSize = mInputs[0]->elementSize(); - if(srcT == DataType_DT_INT32 && dstT == DataType_DT_FLOAT){ + if((srcT == DataType_DT_INT32 || srcT == DataType_DT_INT64) && dstT == DataType_DT_FLOAT){ auto interpPlugin = (nvinfer1::IPluginExt *)MNNTRTCreatePlugion(mOp, plu.get()); nvinfer1::IPluginLayer *plugin = mTrtBackend->getNetwork()->addPluginExt(&xOp[0], 1, *((nvinfer1::IPluginExt *)interpPlugin)); if (plugin == nullptr) { diff --git a/source/backend/tensorrt/execution/TRTCommonExecution.cpp b/source/backend/tensorrt/execution/TRTCommonExecution.cpp index 2a5cc021..1edaea7f 100644 --- a/source/backend/tensorrt/execution/TRTCommonExecution.cpp +++ b/source/backend/tensorrt/execution/TRTCommonExecution.cpp @@ -17,7 +17,9 @@ TRTCommonExecution::TRTCommonExecution(Backend *backend, const Op *op) : Executi ErrorCode TRTCommonExecution::onResize(const std::vector &inputs, const std::vector &outputs) { mInputs = inputs; mOutputs = outputs; - // MNN_PRINT("layer info: Type:%s name:%s \n", EnumNameOpType(mOp->type()), mOp->name()->c_str()); + // if(mOp->name() != nullptr){ + // MNN_PRINT("layer info: Type:%s name:%s \n", EnumNameOpType(mOp->type()), mOp->name()->c_str()); + // } // MNN_PRINT(" =========== layer info: Type:%s =========== \n", EnumNameOpType(mOp->type())); std::vector nvTensors(inputs.size()); for (int i = 0; i < inputs.size(); ++i) { @@ -48,7 +50,7 @@ ErrorCode TRTCommonExecution::onResize(const std::vector &inputs, cons // printf("%d ", out_dims.d[i]); // } // printf("\n"); - // for(int i = 0; i < out_dims.nbDims; i++){ + // for(int i = 0; i < outputs[0]->dimensions(); i++){ // printf("%d ", outputs[0]->shape()[i]); // } // printf("\n"); diff --git a/source/backend/tensorrt/execution/TRTLayerNorm.cpp b/source/backend/tensorrt/execution/TRTLayerNorm.cpp new file mode 100755 index 00000000..0b8142be --- /dev/null +++ b/source/backend/tensorrt/execution/TRTLayerNorm.cpp @@ -0,0 +1,74 @@ +// +// TRTLayerNorm.cpp +// MNN +// +// Created by MNN on 2021/02/08. 
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "TRTLayerNorm.hpp" +#include +#include "TRTBackend.hpp" +#include "schema/current/MNNPlugin_generated.h" + +using namespace std; + +namespace MNN { + +TRTLayerNorm::TRTLayerNorm(Backend *b, const Op *op, const std::vector &inputs, + const std::vector &outputs) + : MNN::TRTCommonExecution(b, op) { +} + +std::vector TRTLayerNorm::onEncode(const std::vector &xOp) { +#ifdef TRT_LOG + printf("TRTLayerNorm in\n"); +#endif + + auto plu = createPluginWithOutput(mOutputs); + + const auto* layer_norm_param = mOp->main_as_LayerNorm(); + int axis_size = layer_norm_param->axis()->size(); + std::vector axis_; + axis_.resize(axis_size); + for (int i = 0; i < axis_size; ++i) { + axis_[i] = layer_norm_param->axis()->Get(i); + } + + int outter_size_ = 1; + int inner_size_ = 1; + int rank = mInputs[0]->dimensions(); + std::vector axis(axis_.size()); + for (int i = 0; i < axis_.size(); ++i) { + if (axis_[i] < 0) { + axis[i] += rank; + } + } + std::sort(axis.begin(), axis.end()); + for (int i = 0; i < rank - axis.size(); ++i) { + outter_size_ *= mInputs[0]->length(i); + } + for (int i = rank - axis.size(); i < rank; ++i) { + inner_size_ *= mInputs[0]->length(i); + } + + plu->main.type = MNNTRTPlugin::Parameter_OneHotInfo; + plu->main.value = new MNNTRTPlugin::OneHotInfoT; + auto onehotp = plu->main.AsOneHotInfo(); + + onehotp->outerSize = outter_size_; + onehotp->innerSize = inner_size_; + + auto interpPlugin = (nvinfer1::IPluginExt *)MNNTRTCreatePlugion(mOp, plu.get()); + nvinfer1::IPluginLayer *plugin = mTrtBackend->getNetwork()->addPluginExt(&xOp[0], mInputs.size(), *((nvinfer1::IPluginExt *)interpPlugin)); + if (plugin == nullptr) { + printf("Interp plugin == nullptr !!!\n"); + } + mTrtBackend->pushReleaseLayer(interpPlugin); + return {plugin->getOutput(0)}; + +} + +TRTCreatorRegister> __layer_norm_op(OpType_LayerNorm); + +} // namespace MNN diff --git a/source/backend/tensorrt/execution/TRTLayerNorm.hpp b/source/backend/tensorrt/execution/TRTLayerNorm.hpp new file mode 100755 index 00000000..cf9fc284 --- /dev/null +++ b/source/backend/tensorrt/execution/TRTLayerNorm.hpp @@ -0,0 +1,28 @@ +// +// TRTLayerNorm.hpp +// MNN +// +// Created by MNN on 2021/02/08. 
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef MNN_TRTLayerNorm_HPP +#define MNN_TRTLayerNorm_HPP + +#include "TRTBackend.hpp" +#include "TRTCommonExecution.hpp" + +namespace MNN { + +class TRTLayerNorm : public TRTCommonExecution { +public: + TRTLayerNorm(Backend *b, const Op *op, const std::vector &inputs, const std::vector &outputs); + virtual ~TRTLayerNorm() = default; + virtual std::vector onEncode(const std::vector &inputs) override; +private: + int mAxis; +}; + +} // namespace MNN + +#endif // MNN_TRTLayerNorm_HPP diff --git a/source/backend/tensorrt/execution/TRTUnary.cpp b/source/backend/tensorrt/execution/TRTUnary.cpp index 6dc94d2e..4f5efc45 100644 --- a/source/backend/tensorrt/execution/TRTUnary.cpp +++ b/source/backend/tensorrt/execution/TRTUnary.cpp @@ -151,6 +151,18 @@ std::vector TRTUnary::onEncode(const std::vector &xOp) { case UnaryOpOperation_ACOS: operation = UnaryOperation::kACOS; break; + case UnaryOpOperation_HARDSWISH: + { + auto plu = createPluginWithOutput(mOutputs); + auto signPlugin = (nvinfer1::IPluginExt *)MNNTRTCreatePlugion(mOp, plu.get()); + nvinfer1::IPluginLayer *plugin = + mTrtBackend->getNetwork()->addPluginExt(&xOp[0], 1, *((nvinfer1::IPluginExt *)signPlugin)); + if (plugin == nullptr) { + printf("plugin == nullptr !!!"); + } + mTrtBackend->pushReleaseLayer(signPlugin); + return {plugin->getOutput(0)}; + } default: MNN_PRINT("unary not support this type : %d \n", mOp->main_as_UnaryOp()->opType()); MNN_ASSERT(false); diff --git a/source/backend/tensorrt/execution/plugin/CastPlugin.cpp b/source/backend/tensorrt/execution/plugin/CastPlugin.cpp index 6427a637..7e4ed398 100755 --- a/source/backend/tensorrt/execution/plugin/CastPlugin.cpp +++ b/source/backend/tensorrt/execution/plugin/CastPlugin.cpp @@ -19,10 +19,16 @@ CastPlugin::~CastPlugin() { } int CastPlugin::onEnqueue(int batchSize, const void* const* inputs, void** outputs, void*, nvinfer1::DataType dataType, cudaStream_t stream) { - const int* bottom_data = reinterpret_cast(inputs[0]); - float* top_data = reinterpret_cast(outputs[0]); - return CastInt32ToFloatExecute(dataType, mCount, bottom_data, top_data, stream); + int size = 0; + if (dataType == nvinfer1::DataType::kFLOAT){ + size = mCount*sizeof(float); + }else{ + size = mCount*sizeof(__half); + } + + auto status = cudaMemcpy(outputs[0], inputs[0], size, cudaMemcpyDeviceToDevice); + MNN_ASSERT(0 == status); } }; // namespace MNN diff --git a/source/backend/tensorrt/execution/plugin/CastPlugin.cu b/source/backend/tensorrt/execution/plugin/CastPlugin.cu deleted file mode 100755 index 05eb37eb..00000000 --- a/source/backend/tensorrt/execution/plugin/CastPlugin.cu +++ /dev/null @@ -1,21 +0,0 @@ - -#include "CastPlugin.hpp" - -namespace MNN { - -__global__ void cast_int_to_float(const int n, const int* in, float* out) { - CUDA_KERNEL_LOOP(index, n) { - int data = in[index]; - out[index] = (float)data; - } -} - -cudaError_t CastPlugin::CastInt32ToFloatExecute(nvinfer1::DataType dataType, const int count, const int* bottom_data, - float* top_data, cudaStream_t stream) { - - cast_int_to_float<<>>(count, bottom_data, top_data); - - return cudaPeekAtLastError(); -} - -}; // namespace MNN diff --git a/source/backend/tensorrt/execution/plugin/CastPlugin.hpp b/source/backend/tensorrt/execution/plugin/CastPlugin.hpp index 24a6a802..527859a7 100755 --- a/source/backend/tensorrt/execution/plugin/CastPlugin.hpp +++ b/source/backend/tensorrt/execution/plugin/CastPlugin.hpp @@ -19,8 +19,6 @@ public: virtual ~CastPlugin(); virtual int 
onEnqueue(int batchSize, const void* const* inputs, void** outputs, void*, nvinfer1::DataType dataType, cudaStream_t stream) override; - cudaError_t CastInt32ToFloatExecute(nvinfer1::DataType dataType, const int count, const int* bottom_data, - float* top_data, cudaStream_t stream); private: int mCount; }; diff --git a/source/backend/tensorrt/execution/plugin/CommonPlugin.cpp b/source/backend/tensorrt/execution/plugin/CommonPlugin.cpp index 2c1fc22d..08107466 100755 --- a/source/backend/tensorrt/execution/plugin/CommonPlugin.cpp +++ b/source/backend/tensorrt/execution/plugin/CommonPlugin.cpp @@ -17,6 +17,7 @@ #include "DetectionPostProcessPlugin.hpp" #include "OneHotPlugin.hpp" #include "CastPlugin.hpp" +#include "LayerNormPlugin.hpp" namespace MNN { @@ -54,6 +55,9 @@ static CommonPlugin::Enqueue* create(const Op* op, const MNNTRTPlugin::Plugin* p if (op->type() == OpType_Cast) { return new CastPlugin(op, plugin); } + if (op->type() == OpType_LayerNorm) { + return new LayerNormPlugin(op, plugin); + } MNN_PRINT("not find plugin type : %d !!! \n"); return nullptr; } diff --git a/source/backend/tensorrt/execution/plugin/CommonPlugin.hpp b/source/backend/tensorrt/execution/plugin/CommonPlugin.hpp index 8af1e291..cbf64170 100644 --- a/source/backend/tensorrt/execution/plugin/CommonPlugin.hpp +++ b/source/backend/tensorrt/execution/plugin/CommonPlugin.hpp @@ -104,9 +104,7 @@ public: } virtual bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const override { - // return (type == nvinfer1::DataType::kFLOAT) && format == nvinfer1::PluginFormat::kNCHW; - return true; - return (type == nvinfer1::DataType::kFLOAT || type == nvinfer1::DataType::kHALF) && format == nvinfer1::PluginFormat::kNCHW; + return (type == nvinfer1::DataType::kFLOAT || type == nvinfer1::DataType::kHALF || type == nvinfer1::DataType::kINT32) && format == nvinfer1::PluginFormat::kNCHW; } virtual void configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs, const nvinfer1::Dims* outputDims, diff --git a/source/backend/tensorrt/execution/plugin/GatherPlugin.cpp b/source/backend/tensorrt/execution/plugin/GatherPlugin.cpp index ec498757..b0b90a41 100644 --- a/source/backend/tensorrt/execution/plugin/GatherPlugin.cpp +++ b/source/backend/tensorrt/execution/plugin/GatherPlugin.cpp @@ -27,7 +27,7 @@ GatherPlugin::~GatherPlugin() { int GatherPlugin::onEnqueue(int batchSize, const void *const *inputs, void **outputs, void *, nvinfer1::DataType dataType, cudaStream_t stream) { const float *bottom_data = reinterpret_cast(inputs[0]); - const int *indices = reinterpret_cast(inputs[1]); + const float *indices = reinterpret_cast(inputs[1]); float *top_data = reinterpret_cast(outputs[0]); if(mInput3){ int axis; diff --git a/source/backend/tensorrt/execution/plugin/GatherPlugin.cu b/source/backend/tensorrt/execution/plugin/GatherPlugin.cu index e6a63d4e..6ffb3876 100644 --- a/source/backend/tensorrt/execution/plugin/GatherPlugin.cu +++ b/source/backend/tensorrt/execution/plugin/GatherPlugin.cu @@ -4,7 +4,7 @@ namespace MNN { template __global__ void GATHER(const int count, const int outputOutsideStride, const int inputOutsideStride, const int N, const int limit, int insideStride, - const T *inputPtr, const int* indicesPtr, T *outputPtr) { + const T *inputPtr, const T* indicesPtr, T *outputPtr) { CUDA_KERNEL_LOOP(index, count) { int o = index / (N*insideStride); int o_r = index % (N*insideStride); @@ -12,23 +12,23 @@ __global__ void GATHER(const int count, const int outputOutsideStride, const int int s = o_r % 
insideStride; int outputIdx = outputOutsideStride * o + i * insideStride + s; - int indices = indicesPtr[i]; + int indices = int(indicesPtr[i]); if (indices < 0 || indices > limit) { outputPtr[outputIdx] = 0.0f; }else{ - int inputIdx = inputOutsideStride * o + insideStride * indicesPtr[i] + s; + int inputIdx = inputOutsideStride * o + insideStride * indices + s; outputPtr[outputIdx] = inputPtr[inputIdx]; } } } -cudaError_t GatherPlugin::GatherExecute(nvinfer1::DataType dataType, const int count, const float* bottom_data, const int* indices, +cudaError_t GatherPlugin::GatherExecute(nvinfer1::DataType dataType, const int count, const float* bottom_data, const float* indices, float* top_data, cudaStream_t stream) { if (dataType == nvinfer1::DataType::kFLOAT){ GATHER<<>>(count, mOutputOutsideStride, mInputOutsideStride, mN, mLimit, mInsideStride, bottom_data, indices, top_data); }else{ - GATHER<__half><<>>(count, mOutputOutsideStride, mInputOutsideStride, mN, mLimit, mInsideStride, (const __half*)bottom_data, indices, (__half*)top_data); + GATHER<__half><<>>(count, mOutputOutsideStride, mInputOutsideStride, mN, mLimit, mInsideStride, (const __half*)bottom_data, (const __half*)indices, (__half*)top_data); } return cudaPeekAtLastError(); } diff --git a/source/backend/tensorrt/execution/plugin/GatherPlugin.hpp b/source/backend/tensorrt/execution/plugin/GatherPlugin.hpp index ad258f33..a19fe7b2 100644 --- a/source/backend/tensorrt/execution/plugin/GatherPlugin.hpp +++ b/source/backend/tensorrt/execution/plugin/GatherPlugin.hpp @@ -19,7 +19,7 @@ public: virtual ~GatherPlugin(); virtual int onEnqueue(int batchSize, const void* const* inputs, void** outputs, void*, nvinfer1::DataType dataType, cudaStream_t stream) override; - cudaError_t GatherExecute(nvinfer1::DataType dataType, const int count, const float* bottom_data, const int* indices, float* top_data, cudaStream_t stream); + cudaError_t GatherExecute(nvinfer1::DataType dataType, const int count, const float* bottom_data, const float* indices, float* top_data, cudaStream_t stream); private: int mCount; diff --git a/source/backend/tensorrt/execution/plugin/LayerNormPlugin.cpp b/source/backend/tensorrt/execution/plugin/LayerNormPlugin.cpp new file mode 100755 index 00000000..9ab5475f --- /dev/null +++ b/source/backend/tensorrt/execution/plugin/LayerNormPlugin.cpp @@ -0,0 +1,52 @@ +// +// LayerNormPlugin.cpp +// MNN +// +// Created by MNN on b'2021/02/08'. 
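[Editor note] After the change above, the gather kernel receives its indices in the same element type as the data (float, or half read through float), converts each one to an integer inside the loop, and zero-fills rows whose index falls outside the table. A CPU reference of that behaviour for checking results; gatherRows and the sample values are illustrative only.

#include <cstdio>
#include <vector>

static std::vector<float> gatherRows(const std::vector<float>& table, int rowLen,
                                     const std::vector<float>& indices) {
    const int limit = static_cast<int>(table.size()) / rowLen;
    std::vector<float> out(indices.size() * rowLen, 0.0f);
    for (size_t i = 0; i < indices.size(); ++i) {
        const int idx = static_cast<int>(indices[i]);  // truncating float -> int, as in the kernel
        if (idx < 0 || idx >= limit) {
            continue;  // leave the row zeroed for out-of-range indices, as the kernel's guard intends
        }
        for (int j = 0; j < rowLen; ++j) {
            out[i * rowLen + j] = table[idx * rowLen + j];
        }
    }
    return out;
}

int main() {
    auto out = gatherRows({1, 2, 3, 4, 5, 6}, 2, {2.0f, 0.0f, 9.0f});
    for (float v : out) std::printf("%g ", v);  // 5 6 1 2 0 0
    std::printf("\n");
    return 0;
}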
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "LayerNormPlugin.hpp" +namespace MNN { +LayerNormPlugin::LayerNormPlugin(const Op* op, const MNNTRTPlugin::Plugin* plugin) { + + const auto* layer_norm_param = op->main_as_LayerNorm(); + int axis_size = layer_norm_param->axis()->size(); + mAxis.resize(axis_size); + for (int i = 0; i < axis_size; ++i) { + mAxis[i] = layer_norm_param->axis()->Get(i); + } + mEpsilon = layer_norm_param->epsilon(); + + int size = layer_norm_param->gamma()->size(); + cudaMalloc(&mGamma, size * sizeof(float)); + MNN_ASSERT(nullptr != mGamma); + const float* gamma_data = layer_norm_param->gamma()->data(); + auto status = cudaMemcpy(mGamma, gamma_data, size * sizeof(float), cudaMemcpyHostToDevice); + MNN_ASSERT(0 == status); + + cudaMalloc(&mBeta, size * sizeof(float)); + MNN_ASSERT(nullptr != mBeta); + + const float* beta_data = layer_norm_param->beta()->data(); + status = cudaMemcpy(mBeta, beta_data, size * sizeof(float), cudaMemcpyHostToDevice); + MNN_ASSERT(0 == status); + + auto Info = plugin->main_as_OneHotInfo(); + mOutterSize = Info->outerSize(); + mInnerSize = Info->innerSize(); + +} +LayerNormPlugin::~LayerNormPlugin() { + cudaFree(mBeta); + cudaFree(mGamma); +} +int LayerNormPlugin::onEnqueue(int batchSize, const void* const* inputs, void** outputs, void*, nvinfer1::DataType dataType, cudaStream_t stream) { + const float* bottom_data = reinterpret_cast(inputs[0]); + float* top_data = reinterpret_cast(outputs[0]); + + return LayerNormExecute(dataType, mOutterSize, mInnerSize, bottom_data, top_data, (const float*)mGamma, (const float*)mBeta, + stream); +} + +} // namespace MNN \ No newline at end of file diff --git a/source/backend/tensorrt/execution/plugin/LayerNormPlugin.cu b/source/backend/tensorrt/execution/plugin/LayerNormPlugin.cu new file mode 100755 index 00000000..17a60fca --- /dev/null +++ b/source/backend/tensorrt/execution/plugin/LayerNormPlugin.cu @@ -0,0 +1,72 @@ +#include "LayerNormPlugin.hpp" +namespace MNN { + +template +__global__ void LayerNorm(const int outter_size_, const int inner_size_, float epsilon_, const T* in, T* out, + const float* gamma, const float* beta); + +template <> +__global__ void LayerNorm(const int outter_size_, const int inner_size_, float epsilon_, const float* in, float* out, + const float* gamma, const float* beta) { + CUDA_KERNEL_LOOP(i, outter_size_) { + int inner_input_index = i * inner_size_; + int inner_output_index = i * inner_size_; + float sum = 0.f; + for (int j = 0; j < inner_size_; ++j) { + sum += in[inner_input_index + j]; + } + float mean = sum / inner_size_; + float square_sum = 0.f; + for (int j = 0; j < inner_size_; ++j) { + square_sum += (in[inner_input_index + j] - mean) * (in[inner_input_index + j] - mean); + } + float variable = square_sum / inner_size_; + variable = 1.f / std::sqrt(variable + epsilon_); + + for (int j = 0; j < inner_size_; ++j) { + out[inner_output_index + j] = (in[inner_input_index + j] - mean) * variable * gamma[j] + beta[j]; + } + } +} + +template <> +__global__ void LayerNorm<__half>(const int outter_size_, const int inner_size_, float epsilon_, const __half* in, __half* out, + const float* gamma, const float* beta) { + CUDA_KERNEL_LOOP(i, outter_size_) { + int inner_input_index = i * inner_size_; + int inner_output_index = i * inner_size_; + float sum = 0.f; + for (int j = 0; j < inner_size_; ++j) { + float data = __half2float(in[inner_input_index + j]); + sum += data; + } + float mean = sum / inner_size_; + float square_sum = 0.f; + for (int j = 0; j < 
inner_size_; ++j) { + float data = __half2float(in[inner_input_index + j]); + square_sum += (data - mean) * (data - mean); + } + float variable = square_sum / inner_size_; + variable = 1.f / std::sqrt(variable + epsilon_); + + for (int j = 0; j < inner_size_; ++j) { + float data = __half2float(in[inner_input_index + j]); + out[inner_output_index + j] = __float2half((data - mean) * variable * gamma[j] + beta[j]); + } + } +} + +cudaError_t LayerNormPlugin::LayerNormExecute(nvinfer1::DataType dataType, const int outter_size_, const int inner_size_, const float* bottom_data, + float* top_data, const float* gamma, const float* beta, cudaStream_t stream) { + + if (dataType == nvinfer1::DataType::kFLOAT){ + LayerNorm<<>>(outter_size_, inner_size_, mEpsilon, bottom_data, top_data, + gamma, beta); + }else{ + LayerNorm<__half><<>>(outter_size_, inner_size_, mEpsilon, (const __half*)bottom_data, (__half*)top_data, + gamma, beta); + } + + return cudaPeekAtLastError(); +} +}; // namespace MNN \ No newline at end of file diff --git a/source/backend/tensorrt/execution/plugin/LayerNormPlugin.hpp b/source/backend/tensorrt/execution/plugin/LayerNormPlugin.hpp new file mode 100755 index 00000000..1008fd77 --- /dev/null +++ b/source/backend/tensorrt/execution/plugin/LayerNormPlugin.hpp @@ -0,0 +1,33 @@ +// +// LayerNormPlugin.hpp +// MNN +// +// Created by MNN on b'2021/02/08'. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef LayerNormPlugin_hpp +#define LayerNormPlugin_hpp +#include +#include "CommonPlugin.hpp" +namespace MNN { +class LayerNormPlugin : public CommonPlugin::Enqueue { +public: + LayerNormPlugin(const Op* op, const MNNTRTPlugin::Plugin* plugin); + virtual ~LayerNormPlugin(); + virtual int onEnqueue(int batchSize, const void* const* inputs, void** outputs, void*, nvinfer1::DataType dataType, + cudaStream_t stream) override; + cudaError_t LayerNormExecute(nvinfer1::DataType dataType, const int outter_size_, const int inner_size_, const float* bottom_data, + float* top_data, const float* gamma, const float* beta, cudaStream_t stream); + +private: + int mInnerSize = 1; + int mOutterSize = 1; + float mEpsilon = 0.001; + void* mGamma = nullptr; + void* mBeta = nullptr; + std::vector mAxis; +}; +} // namespace MNN + +#endif \ No newline at end of file diff --git a/source/backend/tensorrt/execution/plugin/OneHotPlugin.cpp b/source/backend/tensorrt/execution/plugin/OneHotPlugin.cpp index ca6fae21..221a7462 100755 --- a/source/backend/tensorrt/execution/plugin/OneHotPlugin.cpp +++ b/source/backend/tensorrt/execution/plugin/OneHotPlugin.cpp @@ -21,8 +21,7 @@ OneHotPlugin::~OneHotPlugin() { int OneHotPlugin::onEnqueue(int batchSize, const void* const* inputs, void** outputs, void*, nvinfer1::DataType dataType, cudaStream_t stream) { float* output = reinterpret_cast(outputs[0]); - - auto indices = reinterpret_cast(inputs[0]); + const float* indices = reinterpret_cast(inputs[0]); auto depthTensor = reinterpret_cast(inputs[1]); auto onValueTensor = reinterpret_cast(inputs[2]); auto offValueTensor = reinterpret_cast(inputs[3]); diff --git a/source/backend/tensorrt/execution/plugin/OneHotPlugin.cu b/source/backend/tensorrt/execution/plugin/OneHotPlugin.cu index 63374a98..0adeadc3 100755 --- a/source/backend/tensorrt/execution/plugin/OneHotPlugin.cu +++ b/source/backend/tensorrt/execution/plugin/OneHotPlugin.cu @@ -2,13 +2,15 @@ namespace MNN { template -__global__ void OneHotImpl(const int n, const float* depthPtr, int innerSize, const float* indices, const T* onValue, +__global__ void 
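[Editor note] The LayerNorm kernels above compute, per outer row of length inner, the mean, the variance, the 1/sqrt(var + eps) scale, and finally a per-element gamma/beta affine. A plain CPU reference of the same arithmetic, handy for validating the plugin output; layerNormRef is an illustrative name, not part of the plugin.

#include <cmath>
#include <cstdio>

static void layerNormRef(int outer, int inner, float eps,
                         const float* in, const float* gamma, const float* beta, float* out) {
    for (int i = 0; i < outer; ++i) {
        const float* x = in + i * inner;
        float* y = out + i * inner;
        float mean = 0.f;
        for (int j = 0; j < inner; ++j) mean += x[j];
        mean /= inner;
        float var = 0.f;
        for (int j = 0; j < inner; ++j) var += (x[j] - mean) * (x[j] - mean);
        var /= inner;
        const float scale = 1.f / std::sqrt(var + eps);
        for (int j = 0; j < inner; ++j) {
            y[j] = (x[j] - mean) * scale * gamma[j] + beta[j];
        }
    }
}

int main() {
    const float in[4]    = {1.f, 2.f, 3.f, 4.f};
    const float gamma[2] = {1.f, 1.f};
    const float beta[2]  = {0.f, 0.f};
    float out[4];
    layerNormRef(2, 2, 1e-6f, in, gamma, beta, out);
    std::printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);  // ~ -1 1 -1 1
    return 0;
}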
OneHotImpl(const int n, const float* depthPtr, int innerSize, const T* indices, const T* onValue, const T* offValue, T* output) { CUDA_KERNEL_LOOP(i, n) { - int depth = (int)depthPtr[0]; - for (int j = 0; j < depth; ++j) { - for (int k = 0; k < innerSize; ++k) { - auto index = indices[i * innerSize + k]; + + int depth = int(depthPtr[0]); + + for (int j = 0; j < depth; j++) { + for (int k = 0; k < innerSize; k++) { + int index = (int)(indices[i * innerSize + k]); int outputIdx = i*depth*innerSize + j*innerSize + k; if (index == j) { output[outputIdx] = onValue[0]; @@ -22,11 +24,14 @@ __global__ void OneHotImpl(const int n, const float* depthPtr, int innerSize, co cudaError_t OneHotPlugin::OneHotExecute(nvinfer1::DataType dataType, const int count, const float* depth, int innerSize, const float* indices, const float* onValueTensor, const float* offValueTensor, float* outputTensor, cudaStream_t stream) { + + if (dataType == nvinfer1::DataType::kFLOAT){ OneHotImpl<<>>(count, depth, innerSize, indices, onValueTensor, offValueTensor, outputTensor); }else{ - OneHotImpl<__half><<>>(count, depth, innerSize, indices, (const __half*)onValueTensor, (const __half*)offValueTensor, (__half*)outputTensor); - } + OneHotImpl<__half><<>>(count, depth, innerSize, (const __half*)indices, (const __half*)onValueTensor, (const __half*)offValueTensor, (__half*)outputTensor); + } + return cudaPeekAtLastError(); } diff --git a/source/backend/tensorrt/execution/plugin/UnaryPlugin.cu b/source/backend/tensorrt/execution/plugin/UnaryPlugin.cu index 6b0b93ec..af1b4c93 100644 --- a/source/backend/tensorrt/execution/plugin/UnaryPlugin.cu +++ b/source/backend/tensorrt/execution/plugin/UnaryPlugin.cu @@ -74,18 +74,46 @@ __device__ T erfcImpl(T x) { } } + template -__global__ void ERF(const int n, const T* in, T* out) { +__global__ void ERF(const int n, const T* in, T* out); + +template <> +__global__ void ERF(const int n, const float* in, float* out) { CUDA_KERNEL_LOOP(index, n) { - if(abs(in[index]) < T(1.)) { - out[index] = erfImpl(in[index]); + if(abs(in[index]) < float(1.)) { + out[index] = erfImpl(in[index]); } else { - out[index] = T(1.) - erfcImpl(in[index]); + out[index] = float(1.) - erfcImpl(in[index]); } } } +template <> +__global__ void ERF<__half>(const int n, const __half* in, __half* out) { + CUDA_KERNEL_LOOP(index, n) { + if(abs(__half2float(in[index])) < float(1.)) { + out[index] = __float2half(erfImpl(__half2float(in[index]))); + } else { + out[index] = __float2half(float(1.) 
- erfcImpl(__half2float(in[index]))); + } + } +} + +template +__global__ void HARDSWISH(const int n, const T* in, T* out) { + CUDA_KERNEL_LOOP(index, n) { + if(in[index] <= (T)(-3)) { + out[index] = 0; + } else if(in[index] >= (T)3) { + out[index] = in[index]; + } else { + out[index] = in[index] * (in[index] + (T)3) / (T)6; + } + } +} + cudaError_t UnaryPlugin::UnaryExecute(nvinfer1::DataType dataType, const int count, const float* bottom_data, float* top_data, cudaStream_t stream) { if(mType == MNN::UnaryOpOperation_SIGN) { @@ -95,13 +123,18 @@ cudaError_t UnaryPlugin::UnaryExecute(nvinfer1::DataType dataType, const int cou SIGN<__half><<>>(count, (const __half*)bottom_data, (__half*)top_data); } } else if(mType == MNN::UnaryOpOperation_ERF) { - //hangxing TODO , add half support - // if (dataType == nvinfer1::DataType::kFLOAT){ + if (dataType == nvinfer1::DataType::kFLOAT){ ERF<<>>(count, bottom_data, top_data); - // }else{ - // ERF<__half><<>>(count, (const __half*)bottom_data, (__half*)top_data); - // } - }else { + }else{ + ERF<__half><<>>(count, (const __half*)bottom_data, (__half*)top_data); + } + } else if (mType == MNN::UnaryOpOperation_HARDSWISH){ + if (dataType == nvinfer1::DataType::kFLOAT){ + HARDSWISH<<>>(count, bottom_data, top_data); + }else{ + HARDSWISH<__half><<>>(count, (const __half*)bottom_data, (__half*)top_data); + } + } else { printf("Unary Plugin:%d not support\n", mType); } return cudaPeekAtLastError(); diff --git a/source/backend/vulkan/backend/VulkanBackend.cpp b/source/backend/vulkan/backend/VulkanBackend.cpp index ecd5f3cd..6574b2a5 100644 --- a/source/backend/vulkan/backend/VulkanBackend.cpp +++ b/source/backend/vulkan/backend/VulkanBackend.cpp @@ -12,7 +12,7 @@ #include "core/Macro.h" #include #include "core/TensorUtils.hpp" -#include "shape/SizeComputer.hpp" +#include "core/OpCommonUtils.hpp" #include "component/VulkanDevice.hpp" #include "execution/VulkanImageConverter.hpp" #include "component/VulkanInstance.hpp" @@ -61,11 +61,8 @@ std::pair VulkanBackend::onMeasure(const std::vector& inpu if (iter == creator->end()) { return std::make_pair(0.0f, false); } -#ifndef MNN_BUILD_MINI - auto flops = SizeComputer::computeFlops(op, inputs, outputs); -#else + // FIXME: Compute flops auto flops = 0.0f; -#endif const float defaultScheduleCost = 0.001f; return std::make_pair(defaultScheduleCost + flops / 1024.0f / mRuntime->mFlops * 1000.0f, true); } @@ -183,9 +180,8 @@ Execution* VulkanBackend::onCreate(const std::vector& inputs, const std return nullptr; } bool valid = true; -#ifndef MNN_BUILD_MINI for (int i=0; itype(), i)) { + if (!OpCommonUtils::opNeedContent(op->type(), i)) { continue; } auto t = inputs[i]; @@ -207,7 +203,6 @@ Execution* VulkanBackend::onCreate(const std::vector& inputs, const std } } } -#endif for (auto t : outputs) { if (!_supportImageSize(t)) { valid = false; diff --git a/source/backend/vulkan/compiler/AllShader.cpp b/source/backend/vulkan/compiler/AllShader.cpp index 78f6adfd..b771dd6e 100644 --- a/source/backend/vulkan/compiler/AllShader.cpp +++ b/source/backend/vulkan/compiler/AllShader.cpp @@ -1,6 +1,6 @@ #include "../shaders/AllShader.h" const unsigned char glsl_dwweightcopy_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x95, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 
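[Editor note] The HARDSWISH kernel added above implements the usual piecewise form, hardswish(x) = 0 for x <= -3, x for x >= 3, and x*(x+3)/6 in between, in both the float and half paths. A scalar reference for spot checks; hardswishRef is an illustrative name.

#include <cstdio>

static float hardswishRef(float x) {
    if (x <= -3.f) return 0.f;          // saturate low
    if (x >= 3.f) return x;             // pass through high
    return x * (x + 3.f) / 6.f;         // smooth middle segment
}

int main() {
    const float xs[] = {-4.f, -1.f, 0.f, 1.f, 4.f};
    for (float x : xs) {
        std::printf("hardswish(%g) = %g\n", x, hardswishRef(x));
    }
    return 0;
}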
0x4c, @@ -165,7 +165,7 @@ const unsigned char glsl_dwweightcopy_comp[] = { unsigned int glsl_dwweightcopy_comp_len = 1932; const unsigned char glsl_deconvCol2Im_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -223,6 +223,7 @@ const unsigned char glsl_deconvCol2Im_comp[] = { 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x7d, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x03, 0x00, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x85, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x85, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x85, 0x00, 0x00, 0x00, @@ -364,10 +365,10 @@ const unsigned char glsl_deconvCol2Im_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x5b, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_deconvCol2Im_comp_len = 2368; +unsigned int glsl_deconvCol2Im_comp_len = 2380; const unsigned char glsl_convolutionDepthwiseMali_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xb1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -637,7 +638,7 @@ const unsigned char glsl_convolutionDepthwiseMali_comp[] = { unsigned int glsl_convolutionDepthwiseMali_comp_len = 3184; const unsigned char glsl_convolutionDepthwiseMali_RELU_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xb5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -913,7 +914,7 @@ const unsigned char glsl_convolutionDepthwiseMali_RELU_comp[] = { unsigned int glsl_convolutionDepthwiseMali_RELU_comp_len = 3256; const unsigned char glsl_convolutionDepthwiseMali_RELU6_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -1193,7 +1194,7 @@ const unsigned char glsl_convolutionDepthwiseMali_RELU6_comp[] = { unsigned int glsl_convolutionDepthwiseMali_RELU6_comp_len = 3304; const unsigned char glsl_relu_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -1230,6 +1231,7 @@ const unsigned 
char glsl_relu_comp[] = { 0x34, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x34, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x35, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, @@ -1331,10 +1333,10 @@ const unsigned char glsl_relu_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x2e, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_relu_comp_len = 1636; +unsigned int glsl_relu_comp_len = 1648; const unsigned char glsl_unaryImage_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x5e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -1464,7 +1466,7 @@ const unsigned char glsl_unaryImage_comp[] = { unsigned int glsl_unaryImage_comp_len = 1508; const unsigned char glsl_unaryImage_SIGMOID_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x67, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -1604,7 +1606,7 @@ const unsigned char glsl_unaryImage_SIGMOID_comp[] = { unsigned int glsl_unaryImage_SIGMOID_comp_len = 1632; const unsigned char glsl_unaryImage_TANH_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -1736,7 +1738,7 @@ const unsigned char glsl_unaryImage_TANH_comp[] = { unsigned int glsl_unaryImage_TANH_comp_len = 1532; const unsigned char glsl_unaryImage_ABS_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -1868,7 +1870,7 @@ const unsigned char glsl_unaryImage_ABS_comp[] = { unsigned int glsl_unaryImage_ABS_comp_len = 1532; const unsigned char glsl_unaryImage_SQRT_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -2000,7 +2002,7 @@ const unsigned char glsl_unaryImage_SQRT_comp[] = { unsigned int glsl_unaryImage_SQRT_comp_len = 1532; const unsigned char glsl_unaryImage_RSQRT_comp[] = { - 0x03, 0x02, 0x23, 0x07, 
0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -2132,7 +2134,7 @@ const unsigned char glsl_unaryImage_RSQRT_comp[] = { unsigned int glsl_unaryImage_RSQRT_comp_len = 1532; const unsigned char glsl_unaryImage_NEG_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -2263,7 +2265,7 @@ const unsigned char glsl_unaryImage_NEG_comp[] = { unsigned int glsl_unaryImage_NEG_comp_len = 1524; const unsigned char glsl_unaryImage_SQUARE_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -2395,7 +2397,7 @@ const unsigned char glsl_unaryImage_SQUARE_comp[] = { unsigned int glsl_unaryImage_SQUARE_comp_len = 1528; const unsigned char glsl_unaryImage_EXP_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -2527,7 +2529,7 @@ const unsigned char glsl_unaryImage_EXP_comp[] = { unsigned int glsl_unaryImage_EXP_comp_len = 1532; const unsigned char glsl_unaryImage_SIGN_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -2659,7 +2661,7 @@ const unsigned char glsl_unaryImage_SIGN_comp[] = { unsigned int glsl_unaryImage_SIGN_comp_len = 1532; const unsigned char glsl_unaryImage_LOG_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -2797,7 +2799,7 @@ const unsigned char glsl_unaryImage_LOG_comp[] = { unsigned int glsl_unaryImage_LOG_comp_len = 1604; const unsigned char glsl_unaryImage_TAN_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 
0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -2929,7 +2931,7 @@ const unsigned char glsl_unaryImage_TAN_comp[] = { unsigned int glsl_unaryImage_TAN_comp_len = 1532; const unsigned char glsl_unaryImage_COS_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -3061,7 +3063,7 @@ const unsigned char glsl_unaryImage_COS_comp[] = { unsigned int glsl_unaryImage_COS_comp_len = 1532; const unsigned char glsl_unaryImage_SIN_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -3193,7 +3195,7 @@ const unsigned char glsl_unaryImage_SIN_comp[] = { unsigned int glsl_unaryImage_SIN_comp_len = 1532; const unsigned char glsl_unaryImage_CEIL_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -3325,7 +3327,7 @@ const unsigned char glsl_unaryImage_CEIL_comp[] = { unsigned int glsl_unaryImage_CEIL_comp_len = 1532; const unsigned char glsl_unaryImage_FLOOR_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -3457,7 +3459,7 @@ const unsigned char glsl_unaryImage_FLOOR_comp[] = { unsigned int glsl_unaryImage_FLOOR_comp_len = 1532; const unsigned char glsl_unaryImage_EXPM1_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -3594,7 +3596,7 @@ const unsigned char glsl_unaryImage_EXPM1_comp[] = { unsigned int glsl_unaryImage_EXPM1_comp_len = 1596; const unsigned char glsl_unaryImage_RECIPROCAL_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x62, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -3729,7 +3731,7 @@ const unsigned char glsl_unaryImage_RECIPROCAL_comp[] = { unsigned int glsl_unaryImage_RECIPROCAL_comp_len = 1572; const unsigned char glsl_unaryImage_SINH_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 
0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -3861,7 +3863,7 @@ const unsigned char glsl_unaryImage_SINH_comp[] = { unsigned int glsl_unaryImage_SINH_comp_len = 1532; const unsigned char glsl_unaryImage_ASINH_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -3993,7 +3995,7 @@ const unsigned char glsl_unaryImage_ASINH_comp[] = { unsigned int glsl_unaryImage_ASINH_comp_len = 1532; const unsigned char glsl_unaryImage_ASIN_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -4125,7 +4127,7 @@ const unsigned char glsl_unaryImage_ASIN_comp[] = { unsigned int glsl_unaryImage_ASIN_comp_len = 1532; const unsigned char glsl_unaryImage_COSH_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -4257,7 +4259,7 @@ const unsigned char glsl_unaryImage_COSH_comp[] = { unsigned int glsl_unaryImage_COSH_comp_len = 1532; const unsigned char glsl_unaryImage_ACOS_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -4389,7 +4391,7 @@ const unsigned char glsl_unaryImage_ACOS_comp[] = { unsigned int glsl_unaryImage_ACOS_comp_len = 1532; const unsigned char glsl_unaryImage_ACOSH_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -4521,7 +4523,7 @@ const unsigned char glsl_unaryImage_ACOSH_comp[] = { unsigned int glsl_unaryImage_ACOSH_comp_len = 1532; const unsigned char glsl_unaryImage_ATAN_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -4653,7 +4655,7 
@@ const unsigned char glsl_unaryImage_ATAN_comp[] = { unsigned int glsl_unaryImage_ATAN_comp_len = 1532; const unsigned char glsl_unaryImage_ATANH_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -4785,7 +4787,7 @@ const unsigned char glsl_unaryImage_ATANH_comp[] = { unsigned int glsl_unaryImage_ATANH_comp_len = 1532; const unsigned char glsl_unaryImage_LOG1P_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -4922,7 +4924,7 @@ const unsigned char glsl_unaryImage_LOG1P_comp[] = { unsigned int glsl_unaryImage_LOG1P_comp_len = 1596; const unsigned char glsl_unaryImage_ROUND_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -5053,8 +5055,177 @@ const unsigned char glsl_unaryImage_ROUND_comp[] = { }; unsigned int glsl_unaryImage_ROUND_comp_len = 1532; +const unsigned char glsl_unaryImage_HARDSWISH_comp[] = { + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, + 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, + 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, 0x00, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00, 0xb8, 0x01, 0x00, 0x00, + 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, 0x0d, 0x00, 0x00, 0x00, + 0x67, 0x6c, 0x5f, 0x47, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x49, 0x6e, 0x76, + 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x44, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x05, 0x00, 0x15, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x15, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x73, 0x69, 0x7a, 0x65, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x04, 0x00, 0x47, 0x00, 0x00, 0x00, 0x75, 0x49, 0x6e, 0x70, + 0x75, 0x74, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00, + 0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x0d, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 
0x1c, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x15, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, + 0x15, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, + 0x6f, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, + 0x6f, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x0d, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x0a, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x04, 0x00, 0x14, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, 0x15, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x0a, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, + 0x40, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x19, 0x00, 0x09, 0x00, + 0x44, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x03, 0x00, + 0x45, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x4e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0xc0, 
0x2c, 0x00, 0x07, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, + 0x4e, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x04, 0x00, 0x50, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x40, 0x00, 0x00, 0x00, 0x53, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3f, + 0x2c, 0x00, 0x07, 0x00, 0x41, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x07, 0x00, 0x41, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x00, 0x53, 0x00, 0x00, 0x00, 0x53, 0x00, 0x00, 0x00, + 0x53, 0x00, 0x00, 0x00, 0x53, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x40, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x40, + 0x2c, 0x00, 0x07, 0x00, 0x41, 0x00, 0x00, 0x00, 0x5a, 0x00, 0x00, 0x00, + 0x59, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, + 0x59, 0x00, 0x00, 0x00, 0x19, 0x00, 0x09, 0x00, 0x6d, 0x00, 0x00, 0x00, + 0x40, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x6e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, + 0x6e, 0x00, 0x00, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x73, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x00, 0x2c, 0x00, 0x06, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x73, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, + 0x3e, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x7f, 0x00, 0x00, 0x00, 0xab, 0xaa, 0x2a, 0x3e, 0x2c, 0x00, 0x07, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, + 0x7f, 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x03, 0x00, 0x07, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, + 0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, + 0xb1, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, + 0x1f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, + 0x1d, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x19, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 
0x26, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x27, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x29, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x87, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x2a, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, 0x52, 0x00, 0x06, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x2a, 0x00, 0x00, 0x00, + 0x81, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, + 0x29, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x38, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x06, 0x00, 0x07, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x38, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x87, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x00, 0x00, + 0x34, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x52, 0x00, 0x06, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x7d, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x00, 0x00, + 0x7b, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x45, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x64, 0x00, 0x04, 0x00, 0x44, 0x00, 0x00, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x00, 0x00, 0x5f, 0x00, 0x07, 0x00, 0x41, 0x00, 0x00, 0x00, + 0x4b, 0x00, 0x00, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x7d, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0xba, 0x00, 0x05, 0x00, + 0x50, 0x00, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, + 0x4f, 0x00, 0x00, 0x00, 0xa9, 0x00, 0x06, 0x00, 0x41, 0x00, 0x00, 0x00, + 0x56, 0x00, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x00, + 0x54, 0x00, 0x00, 0x00, 0xb8, 0x00, 0x05, 0x00, 0x50, 0x00, 0x00, 0x00, + 0x5b, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, 0x5a, 0x00, 0x00, 0x00, + 0xa9, 0x00, 0x06, 0x00, 0x41, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, + 0x5b, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, + 0x85, 0x00, 0x05, 0x00, 0x41, 0x00, 0x00, 0x00, 0x5f, 0x00, 0x00, 0x00, + 0x56, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, 0x81, 0x00, 0x05, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, + 0x5a, 0x00, 0x00, 0x00, 0x85, 0x00, 0x05, 0x00, 0x41, 0x00, 0x00, 0x00, + 0x66, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, + 0x85, 0x00, 0x05, 0x00, 0x41, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, + 0x5c, 0x00, 0x00, 0x00, 0x66, 0x00, 0x00, 0x00, 0x81, 0x00, 0x05, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x00, 0x83, 0x00, 0x05, 0x00, 0x41, 0x00, 0x00, 0x00, + 0x6b, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, + 0x85, 0x00, 0x05, 0x00, 0x41, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, + 0x5f, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x6d, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x6f, 0x00, 0x00, 0x00, + 0x63, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00, 0x7d, 0x00, 0x00, 0x00, + 0x6c, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, + 0x38, 0x00, 0x01, 0x00 +}; +unsigned int glsl_unaryImage_HARDSWISH_comp_len = 1972; + const unsigned char glsl_im2col_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 
0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -5354,7 +5525,7 @@ const unsigned char glsl_im2col_comp[] = { unsigned int glsl_im2col_comp_len = 3548; const unsigned char glsl_convolutionDepthwise_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xb1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -5624,7 +5795,7 @@ const unsigned char glsl_convolutionDepthwise_comp[] = { unsigned int glsl_convolutionDepthwise_comp_len = 3184; const unsigned char glsl_convolutionDepthwise_RELU_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xb5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -5900,7 +6071,7 @@ const unsigned char glsl_convolutionDepthwise_RELU_comp[] = { unsigned int glsl_convolutionDepthwise_RELU_comp_len = 3256; const unsigned char glsl_convolutionDepthwise_RELU6_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -6180,7 +6351,7 @@ const unsigned char glsl_convolutionDepthwise_RELU6_comp[] = { unsigned int glsl_convolutionDepthwise_RELU6_comp_len = 3304; const unsigned char glsl_relu6_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -6217,6 +6388,7 @@ const unsigned char glsl_relu6_comp[] = { 0x34, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x34, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x35, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x3b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x3b, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, @@ -6320,10 +6492,10 @@ const unsigned char glsl_relu6_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x2e, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_relu6_comp_len = 1660; +unsigned int glsl_relu6_comp_len = 1672; const unsigned char glsl_convolution_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x1b, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 
0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -6687,7 +6859,7 @@ const unsigned char glsl_convolution_comp[] = { unsigned int glsl_convolution_comp_len = 4316; const unsigned char glsl_convolution_RELU_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x1d, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -7053,7 +7225,7 @@ const unsigned char glsl_convolution_RELU_comp[] = { unsigned int glsl_convolution_RELU_comp_len = 4344; const unsigned char glsl_convolution_RELU6_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x1f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -7423,7 +7595,7 @@ const unsigned char glsl_convolution_RELU6_comp[] = { unsigned int glsl_convolution_RELU6_comp_len = 4392; const unsigned char glsl_binaryImage_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -7578,7 +7750,7 @@ const unsigned char glsl_binaryImage_comp[] = { unsigned int glsl_binaryImage_comp_len = 1804; const unsigned char glsl_binaryImage_ADD_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -7764,7 +7936,7 @@ const unsigned char glsl_binaryImage_ADD_comp[] = { unsigned int glsl_binaryImage_ADD_comp_len = 2180; const unsigned char glsl_binaryImage_SUB_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -7950,7 +8122,7 @@ const unsigned char glsl_binaryImage_SUB_comp[] = { unsigned int glsl_binaryImage_SUB_comp_len = 2180; const unsigned char glsl_binaryImage_MUL_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -8136,7 +8308,7 @@ const unsigned char glsl_binaryImage_MUL_comp[] = { unsigned int glsl_binaryImage_MUL_comp_len = 2180; const unsigned char glsl_binaryImage_DIV_comp[] = { - 0x03, 0x02, 0x23, 0x07, 
0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -8334,7 +8506,7 @@ const unsigned char glsl_binaryImage_DIV_comp[] = { unsigned int glsl_binaryImage_DIV_comp_len = 2320; const unsigned char glsl_binaryImage_POW_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -8521,7 +8693,7 @@ const unsigned char glsl_binaryImage_POW_comp[] = { unsigned int glsl_binaryImage_POW_comp_len = 2188; const unsigned char glsl_binaryImage_VMAX_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -8708,7 +8880,7 @@ const unsigned char glsl_binaryImage_VMAX_comp[] = { unsigned int glsl_binaryImage_VMAX_comp_len = 2188; const unsigned char glsl_binaryImage_SQUDIFF_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -8896,7 +9068,7 @@ const unsigned char glsl_binaryImage_SQUDIFF_comp[] = { unsigned int glsl_binaryImage_SQUDIFF_comp_len = 2200; const unsigned char glsl_binaryImage_VMIN_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -9083,7 +9255,7 @@ const unsigned char glsl_binaryImage_VMIN_comp[] = { unsigned int glsl_binaryImage_VMIN_comp_len = 2188; const unsigned char glsl_matmul_input_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -9259,7 +9431,7 @@ const unsigned char glsl_matmul_input_comp[] = { unsigned int glsl_matmul_input_comp_len = 2056; const unsigned char glsl_matmul_input_TRANSPOSE_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 
0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -9464,7 +9636,7 @@ const unsigned char glsl_matmul_input_TRANSPOSE_comp[] = { unsigned int glsl_matmul_input_TRANSPOSE_comp_len = 2408; const unsigned char glsl_nchwToimage_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xd5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -9731,7 +9903,7 @@ const unsigned char glsl_nchwToimage_comp[] = { unsigned int glsl_nchwToimage_comp_len = 3156; const unsigned char glsl_packAsImage4x4_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x13, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x32, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, @@ -9999,7 +10171,7 @@ const unsigned char glsl_packAsImage4x4_comp[] = { unsigned int glsl_packAsImage4x4_comp_len = 3160; const unsigned char glsl_packAsImage4x4_TRANSPOSE_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x44, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x32, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, @@ -10303,7 +10475,7 @@ const unsigned char glsl_packAsImage4x4_TRANSPOSE_comp[] = { unsigned int glsl_packAsImage4x4_TRANSPOSE_comp_len = 3592; const unsigned char glsl_roipooling_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x24, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -10623,7 +10795,7 @@ const unsigned char glsl_roipooling_comp[] = { unsigned int glsl_roipooling_comp_len = 3788; const unsigned char glsl_blit_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x7d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -10813,7 +10985,7 @@ const unsigned char glsl_blit_comp[] = { unsigned int glsl_blit_comp_len = 2232; const unsigned char glsl_blit_image_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xb5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -11035,7 +11207,7 @@ const unsigned char glsl_blit_image_comp[] = { unsigned int glsl_blit_image_comp_len = 2616; const unsigned char glsl_fill_image_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 
0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -11154,7 +11326,7 @@ const unsigned char glsl_fill_image_comp[] = { unsigned int glsl_fill_image_comp_len = 1380; const unsigned char glsl_imageTonchw_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xf1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -11199,6 +11371,7 @@ const unsigned char glsl_imageTonchw_comp[] = { 0x3f, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x44, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, @@ -11444,10 +11617,10 @@ const unsigned char glsl_imageTonchw_comp[] = { 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_imageTonchw_comp_len = 3464; +unsigned int glsl_imageTonchw_comp_len = 3476; const unsigned char glsl_softmaxHeight_NHWC_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -11670,7 +11843,7 @@ const unsigned char glsl_softmaxHeight_NHWC_comp[] = { unsigned int glsl_softmaxHeight_NHWC_comp_len = 2628; const unsigned char glsl_resizeNearest_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -11859,7 +12032,7 @@ const unsigned char glsl_resizeNearest_comp[] = { unsigned int glsl_resizeNearest_comp_len = 2216; const unsigned char glsl_reduce_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x7b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -12007,7 +12180,7 @@ const unsigned char glsl_reduce_comp[] = { unsigned int glsl_reduce_comp_len = 1720; const unsigned char glsl_reduce_VMAX_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x8b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 
0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -12178,7 +12351,7 @@ const unsigned char glsl_reduce_VMAX_comp[] = { unsigned int glsl_reduce_VMAX_comp_len = 1996; const unsigned char glsl_reduce_VMIN_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x8b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -12349,7 +12522,7 @@ const unsigned char glsl_reduce_VMIN_comp[] = { unsigned int glsl_reduce_VMIN_comp_len = 1996; const unsigned char glsl_reduce_MEAN_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -12525,7 +12698,7 @@ const unsigned char glsl_reduce_MEAN_comp[] = { unsigned int glsl_reduce_MEAN_comp_len = 2060; const unsigned char glsl_reduce_PROD_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x8b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -12695,7 +12868,7 @@ const unsigned char glsl_reduce_PROD_comp[] = { unsigned int glsl_reduce_PROD_comp_len = 1988; const unsigned char glsl_reduce_SUM_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x8b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -12865,7 +13038,7 @@ const unsigned char glsl_reduce_SUM_comp[] = { unsigned int glsl_reduce_SUM_comp_len = 1988; const unsigned char glsl_resizeBilinear_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xc7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -13118,7 +13291,7 @@ const unsigned char glsl_resizeBilinear_comp[] = { unsigned int glsl_resizeBilinear_comp_len = 2988; const unsigned char glsl_nchwTonc4hw4_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x37, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -13432,7 +13605,7 @@ const unsigned char glsl_nchwTonc4hw4_comp[] = { unsigned int glsl_nchwTonc4hw4_comp_len = 3716; const unsigned char glsl_nc4hw4Tonchw_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 
0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x0b, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -13716,7 +13889,7 @@ const unsigned char glsl_nc4hw4Tonchw_comp[] = { unsigned int glsl_nc4hw4Tonchw_comp_len = 3360; const unsigned char glsl_buffer2Image2D_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -13849,7 +14022,7 @@ const unsigned char glsl_buffer2Image2D_comp[] = { unsigned int glsl_buffer2Image2D_comp_len = 1544; const unsigned char glsl_im2col1x1_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xcd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -14105,7 +14278,7 @@ const unsigned char glsl_im2col1x1_comp[] = { unsigned int glsl_im2col1x1_comp_len = 3016; const unsigned char glsl_avgpool_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xa5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -14333,7 +14506,7 @@ const unsigned char glsl_avgpool_comp[] = { unsigned int glsl_avgpool_comp_len = 2684; const unsigned char glsl_unPackImage4x4_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -14563,7 +14736,7 @@ const unsigned char glsl_unPackImage4x4_comp[] = { unsigned int glsl_unPackImage4x4_comp_len = 2712; const unsigned char glsl_unPackImage4x4_TRANSPOSE_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xe6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -14829,7 +15002,7 @@ const unsigned char glsl_unPackImage4x4_TRANSPOSE_comp[] = { unsigned int glsl_unPackImage4x4_TRANSPOSE_comp_len = 3144; const unsigned char glsl_maxpool_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -15041,7 +15214,7 @@ const unsigned char 
glsl_maxpool_comp[] = { unsigned int glsl_maxpool_comp_len = 2496; const unsigned char glsl_winogradTransformDest2_3_1_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x97, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -15463,7 +15636,7 @@ const unsigned char glsl_winogradTransformDest2_3_1_comp[] = { unsigned int glsl_winogradTransformDest2_3_1_comp_len = 5016; const unsigned char glsl_winogradTransformDest2_3_1_RELU_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xa1, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -15898,7 +16071,7 @@ const unsigned char glsl_winogradTransformDest2_3_1_RELU_comp[] = { unsigned int glsl_winogradTransformDest2_3_1_RELU_comp_len = 5172; const unsigned char glsl_winogradTransformDest2_3_1_RELU6_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xa3, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -16338,7 +16511,7 @@ const unsigned char glsl_winogradTransformDest2_3_1_RELU6_comp[] = { unsigned int glsl_winogradTransformDest2_3_1_RELU6_comp_len = 5232; const unsigned char glsl_winogradTransformSource2_3_1_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x3a, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -16880,7 +17053,7 @@ const unsigned char glsl_winogradTransformSource2_3_1_comp[] = { unsigned int glsl_winogradTransformSource2_3_1_comp_len = 6456; const unsigned char glsl_col2Im_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xa7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -17108,7 +17281,7 @@ const unsigned char glsl_col2Im_comp[] = { unsigned int glsl_col2Im_comp_len = 2680; const unsigned char glsl_col2Im_RELU_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xab, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -17342,7 +17515,7 @@ const unsigned char glsl_col2Im_RELU_comp[] = { unsigned int glsl_col2Im_RELU_comp_len = 2752; const unsigned char glsl_col2Im_RELU6_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 
0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xad, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -17580,7 +17753,7 @@ const unsigned char glsl_col2Im_RELU6_comp[] = { unsigned int glsl_col2Im_RELU6_comp_len = 2800; const unsigned char glsl_nc4hw4toimage_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -17773,7 +17946,7 @@ const unsigned char glsl_nc4hw4toimage_comp[] = { unsigned int glsl_nc4hw4toimage_comp_len = 2260; const unsigned char glsl_matmul_kernel_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -17985,7 +18158,7 @@ const unsigned char glsl_matmul_kernel_comp[] = { unsigned int glsl_matmul_kernel_comp_len = 2488; const unsigned char glsl_matmul_kernel_TRANSPOSE_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -18154,7 +18327,7 @@ const unsigned char glsl_matmul_kernel_TRANSPOSE_comp[] = { unsigned int glsl_matmul_kernel_TRANSPOSE_comp_len = 1976; const unsigned char glsl_imageTonc4hw4_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, @@ -18205,6 +18378,7 @@ const unsigned char glsl_imageTonc4hw4_comp[] = { 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x03, 0x00, 0x44, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, @@ -18344,10 +18518,10 @@ const unsigned char glsl_imageTonc4hw4_comp[] = { 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_imageTonc4hw4_comp_len = 2264; +unsigned int glsl_imageTonc4hw4_comp_len = 2276; const unsigned char glsl_matmul_output_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 
0x00, 0x08, 0x00, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -18523,7 +18697,7 @@ const unsigned char glsl_matmul_output_comp[] = { unsigned int glsl_matmul_output_comp_len = 2056; const unsigned char glsl_matmul_output_BIAS_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -18723,7 +18897,7 @@ const unsigned char glsl_matmul_output_BIAS_comp[] = { unsigned int glsl_matmul_output_BIAS_comp_len = 2348; const unsigned char glsl_gemm16x16_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x73, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -19070,7 +19244,7 @@ const unsigned char glsl_gemm16x16_comp[] = { unsigned int glsl_gemm16x16_comp_len = 4108; const unsigned char glsl_gemm16x16_FP16_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x59, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x09, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, @@ -19439,7 +19613,7 @@ const unsigned char glsl_gemm16x16_FP16_comp[] = { unsigned int glsl_gemm16x16_FP16_comp_len = 4372; const unsigned char glsl_deconvolutionDepthwise_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -19776,7 +19950,7 @@ const unsigned char glsl_deconvolutionDepthwise_comp[] = { unsigned int glsl_deconvolutionDepthwise_comp_len = 3992; const unsigned char glsl_deconvolutionDepthwise_RELU_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xf3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -20119,7 +20293,7 @@ const unsigned char glsl_deconvolutionDepthwise_RELU_comp[] = { unsigned int glsl_deconvolutionDepthwise_RELU_comp_len = 4064; const unsigned char glsl_deconvolutionDepthwise_RELU6_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ 
-20466,7 +20640,7 @@ const unsigned char glsl_deconvolutionDepthwise_RELU6_comp[] = { unsigned int glsl_deconvolutionDepthwise_RELU6_comp_len = 4112; const unsigned char glsl_preluWithChannel_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x66, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -20511,6 +20685,7 @@ const unsigned char glsl_preluWithChannel_comp[] = { 0x46, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x5d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x5d, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, @@ -20626,10 +20801,10 @@ const unsigned char glsl_preluWithChannel_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x34, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_preluWithChannel_comp_len = 1900; +unsigned int glsl_preluWithChannel_comp_len = 1912; const unsigned char glsl_deconvIm2Col_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x29, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -20991,7 +21166,7 @@ const unsigned char glsl_deconvIm2Col_comp[] = { unsigned int glsl_deconvIm2Col_comp_len = 4296; const unsigned char glsl_deconvIm2Col_RELU_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x2d, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -21359,7 +21534,7 @@ const unsigned char glsl_deconvIm2Col_RELU_comp[] = { unsigned int glsl_deconvIm2Col_RELU_comp_len = 4368; const unsigned char glsl_deconvIm2Col_RELU6_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x2f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -21731,7 +21906,7 @@ const unsigned char glsl_deconvIm2Col_RELU6_comp[] = { unsigned int glsl_deconvIm2Col_RELU6_comp_len = 4416; const unsigned char glsl_buffer2Image1D_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, @@ -21839,7 +22014,7 @@ const unsigned char glsl_buffer2Image1D_comp[] 
= { unsigned int glsl_buffer2Image1D_comp_len = 1244; const unsigned char glsl_scale_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x58, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, @@ -22008,7 +22183,7 @@ const unsigned char glsl_scale_comp[] = { unsigned int glsl_scale_comp_len = 1976; const unsigned char glsl_buffer2Image3D_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x08, 0x00, 0x08, 0x00, + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0a, 0x00, 0x08, 0x00, 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, diff --git a/source/backend/vulkan/compiler/VulkanShaderMap.cpp b/source/backend/vulkan/compiler/VulkanShaderMap.cpp index c307a10a..b8997d82 100644 --- a/source/backend/vulkan/compiler/VulkanShaderMap.cpp +++ b/source/backend/vulkan/compiler/VulkanShaderMap.cpp @@ -37,6 +37,7 @@ mMaps.insert(std::make_pair("glsl_unaryImage_ATAN_comp", std::make_pair(glsl_una mMaps.insert(std::make_pair("glsl_unaryImage_ATANH_comp", std::make_pair(glsl_unaryImage_ATANH_comp,glsl_unaryImage_ATANH_comp_len))); mMaps.insert(std::make_pair("glsl_unaryImage_LOG1P_comp", std::make_pair(glsl_unaryImage_LOG1P_comp,glsl_unaryImage_LOG1P_comp_len))); mMaps.insert(std::make_pair("glsl_unaryImage_ROUND_comp", std::make_pair(glsl_unaryImage_ROUND_comp,glsl_unaryImage_ROUND_comp_len))); +mMaps.insert(std::make_pair("glsl_unaryImage_HARDSWISH_comp", std::make_pair(glsl_unaryImage_HARDSWISH_comp,glsl_unaryImage_HARDSWISH_comp_len))); mMaps.insert(std::make_pair("glsl_im2col_comp", std::make_pair(glsl_im2col_comp,glsl_im2col_comp_len))); mMaps.insert(std::make_pair("glsl_convolutionDepthwise_comp", std::make_pair(glsl_convolutionDepthwise_comp,glsl_convolutionDepthwise_comp_len))); mMaps.insert(std::make_pair("glsl_convolutionDepthwise_RELU_comp", std::make_pair(glsl_convolutionDepthwise_RELU_comp,glsl_convolutionDepthwise_RELU_comp_len))); diff --git a/source/backend/vulkan/execution/VulkanRelu.cpp b/source/backend/vulkan/execution/VulkanRelu.cpp index b9980ad1..3cb0c64f 100644 --- a/source/backend/vulkan/execution/VulkanRelu.cpp +++ b/source/backend/vulkan/execution/VulkanRelu.cpp @@ -163,6 +163,7 @@ public: } else { return new VulkanPrelu(bn, op); } + return nullptr; } }; diff --git a/source/backend/vulkan/execution/VulkanUnary.cpp b/source/backend/vulkan/execution/VulkanUnary.cpp index 9dcac7ab..5b331fd5 100644 --- a/source/backend/vulkan/execution/VulkanUnary.cpp +++ b/source/backend/vulkan/execution/VulkanUnary.cpp @@ -72,6 +72,7 @@ static std::string _getMidType(const Op* op) { SETTYPE(UnaryOpOperation_LOG1P, "LOG1P"); SETTYPE(UnaryOpOperation_ROUND, "ROUND"); + SETTYPE(UnaryOpOperation_HARDSWISH, "HARDSWISH"); } while(false); #undef SETTYPE } diff --git a/source/backend/vulkan/execution/glsl/col2Im.comp b/source/backend/vulkan/execution/glsl/col2Im.comp index c01ebf0a..cc1973ab 100644 --- a/source/backend/vulkan/execution/glsl/col2Im.comp +++ b/source/backend/vulkan/execution/glsl/col2Im.comp @@ -1,5 +1,4 @@ #version 440 core -layout(std430) uniform; layout(set=0, binding=0) uniform sampler2D uInput; layout(set=0, binding=1) writeonly uniform image3D 
uOutput; diff --git a/source/backend/vulkan/execution/glsl/deconvCol2Im.comp b/source/backend/vulkan/execution/glsl/deconvCol2Im.comp index be4f2f74..7035f1ae 100644 --- a/source/backend/vulkan/execution/glsl/deconvCol2Im.comp +++ b/source/backend/vulkan/execution/glsl/deconvCol2Im.comp @@ -1,5 +1,4 @@ #version 440 core -layout(std430) uniform; layout(set=0, binding=0) uniform mediump sampler3D uInput; layout(set=0, binding=1) writeonly uniform mediump image2D uOutput; diff --git a/source/backend/vulkan/execution/glsl/deconvIm2Col.comp b/source/backend/vulkan/execution/glsl/deconvIm2Col.comp index a8268ebf..c4327456 100644 --- a/source/backend/vulkan/execution/glsl/deconvIm2Col.comp +++ b/source/backend/vulkan/execution/glsl/deconvIm2Col.comp @@ -1,6 +1,5 @@ #version 440 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly mediump uniform image3D uOutput; layout(set=0, binding=1) uniform mediump sampler2D uInput; diff --git a/source/backend/vulkan/execution/glsl/deconvolutionDepthwise.comp b/source/backend/vulkan/execution/glsl/deconvolutionDepthwise.comp index f72aca57..3670eb1f 100644 --- a/source/backend/vulkan/execution/glsl/deconvolutionDepthwise.comp +++ b/source/backend/vulkan/execution/glsl/deconvolutionDepthwise.comp @@ -1,6 +1,5 @@ #version 450 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly mediump uniform image3D uOutput; layout(set=0, binding=1) uniform mediump sampler3D uInput; diff --git a/source/backend/vulkan/execution/glsl/dwweightcopy.comp b/source/backend/vulkan/execution/glsl/dwweightcopy.comp index 293ea802..e8886bac 100644 --- a/source/backend/vulkan/execution/glsl/dwweightcopy.comp +++ b/source/backend/vulkan/execution/glsl/dwweightcopy.comp @@ -1,5 +1,4 @@ #version 440 core -layout(std430) uniform; layout(set=0, binding=0) writeonly uniform image2D uOutput; layout(set=0, binding=1) uniform sampler3D uInput; diff --git a/source/backend/vulkan/execution/glsl/im2col.comp b/source/backend/vulkan/execution/glsl/im2col.comp index 132a63db..bbad8cce 100644 --- a/source/backend/vulkan/execution/glsl/im2col.comp +++ b/source/backend/vulkan/execution/glsl/im2col.comp @@ -1,6 +1,5 @@ #version 440 core layout(std140) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly mediump uniform image2D uOutput; layout(set=0, binding=1) uniform mediump sampler3D uInput; diff --git a/source/backend/vulkan/execution/glsl/im2col1x1.comp b/source/backend/vulkan/execution/glsl/im2col1x1.comp index f1cb0bd6..1382b6d5 100644 --- a/source/backend/vulkan/execution/glsl/im2col1x1.comp +++ b/source/backend/vulkan/execution/glsl/im2col1x1.comp @@ -1,6 +1,5 @@ #version 440 core layout(std140) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly mediump uniform image2D uOutput; layout(set=0, binding=1) mediump uniform sampler3D uInput; diff --git a/source/backend/vulkan/execution/glsl/imageTonc4hw4.comp b/source/backend/vulkan/execution/glsl/imageTonc4hw4.comp index 8f7ea9b3..02a4c00a 100644 --- a/source/backend/vulkan/execution/glsl/imageTonc4hw4.comp +++ b/source/backend/vulkan/execution/glsl/imageTonc4hw4.comp @@ -1,6 +1,5 @@ #version 440 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) uniform mediump sampler3D uInput; layout(set=0, binding=1) writeonly buffer destBuffer{ diff --git a/source/backend/vulkan/execution/glsl/imageTonchw.comp b/source/backend/vulkan/execution/glsl/imageTonchw.comp index cb4fb8d9..41f02caf 100644 --- 
a/source/backend/vulkan/execution/glsl/imageTonchw.comp +++ b/source/backend/vulkan/execution/glsl/imageTonchw.comp @@ -1,6 +1,5 @@ #version 440 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) uniform mediump sampler3D uInput; layout(set=0, binding=1) writeonly buffer destBuffer{ diff --git a/source/backend/vulkan/execution/glsl/macro.json b/source/backend/vulkan/execution/glsl/macro.json index 8a13bce8..e7324e2f 100644 --- a/source/backend/vulkan/execution/glsl/macro.json +++ b/source/backend/vulkan/execution/glsl/macro.json @@ -55,7 +55,8 @@ "ATAN", "ATANH", "LOG1P", - "ROUND" + "ROUND", + "HARDSWISH" ], "unPackImage4x4.comp":[ "TRANSPOSE" diff --git a/source/backend/vulkan/execution/glsl/matmul_input.comp b/source/backend/vulkan/execution/glsl/matmul_input.comp index 54a6881d..e9286d53 100644 --- a/source/backend/vulkan/execution/glsl/matmul_input.comp +++ b/source/backend/vulkan/execution/glsl/matmul_input.comp @@ -1,6 +1,5 @@ #version 440 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput; layout(set=0, binding=1) uniform sampler3D uInput; layout(set=0, binding=2) uniform offsetBuffer { diff --git a/source/backend/vulkan/execution/glsl/matmul_kernel.comp b/source/backend/vulkan/execution/glsl/matmul_kernel.comp index e51801b3..100282e7 100644 --- a/source/backend/vulkan/execution/glsl/matmul_kernel.comp +++ b/source/backend/vulkan/execution/glsl/matmul_kernel.comp @@ -1,6 +1,5 @@ #version 440 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput; layout(set=0, binding=1) uniform sampler3D uInput; layout(set=0, binding=2) uniform offsetBuffer { diff --git a/source/backend/vulkan/execution/glsl/matmul_output.comp b/source/backend/vulkan/execution/glsl/matmul_output.comp index ee355238..410538af 100644 --- a/source/backend/vulkan/execution/glsl/matmul_output.comp +++ b/source/backend/vulkan/execution/glsl/matmul_output.comp @@ -1,6 +1,5 @@ #version 440 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly restrict mediump uniform image3D uOutput; layout(set=0, binding=1) uniform sampler2D uInput; layout(set=0, binding=2) uniform offsetBuffer { diff --git a/source/backend/vulkan/execution/glsl/maxpool.comp b/source/backend/vulkan/execution/glsl/maxpool.comp index ecac349b..20a6adf2 100644 --- a/source/backend/vulkan/execution/glsl/maxpool.comp +++ b/source/backend/vulkan/execution/glsl/maxpool.comp @@ -1,6 +1,5 @@ #version 440 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly restrict uniform image3D uOutput; layout(set=0, binding=1) uniform sampler3D uInput; diff --git a/source/backend/vulkan/execution/glsl/nc4hw4toimage.comp b/source/backend/vulkan/execution/glsl/nc4hw4toimage.comp index eab099f6..1739261a 100644 --- a/source/backend/vulkan/execution/glsl/nc4hw4toimage.comp +++ b/source/backend/vulkan/execution/glsl/nc4hw4toimage.comp @@ -1,6 +1,5 @@ #version 440 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly restrict mediump uniform image3D uOutput; layout(set=0, binding=1) readonly buffer destBuffer{ diff --git a/source/backend/vulkan/execution/glsl/nchwToimage.comp b/source/backend/vulkan/execution/glsl/nchwToimage.comp index 84ffc78c..5d55a991 100644 --- a/source/backend/vulkan/execution/glsl/nchwToimage.comp +++ b/source/backend/vulkan/execution/glsl/nchwToimage.comp @@ -1,6 +1,5 @@ #version 
450 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly restrict mediump uniform image3D uOutput; layout(set=0, binding=1) readonly buffer sourceBuffer{ diff --git a/source/backend/vulkan/execution/glsl/packAsImage4x4.comp b/source/backend/vulkan/execution/glsl/packAsImage4x4.comp index 59e13720..43ec6b1f 100644 --- a/source/backend/vulkan/execution/glsl/packAsImage4x4.comp +++ b/source/backend/vulkan/execution/glsl/packAsImage4x4.comp @@ -1,6 +1,5 @@ #version 440 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly restrict highp uniform image2D uOutput; layout(set=0, binding=1) readonly buffer sourceBuffer{ diff --git a/source/backend/vulkan/execution/glsl/preluWithChannel.comp b/source/backend/vulkan/execution/glsl/preluWithChannel.comp index 69f8e915..d5320a07 100644 --- a/source/backend/vulkan/execution/glsl/preluWithChannel.comp +++ b/source/backend/vulkan/execution/glsl/preluWithChannel.comp @@ -1,6 +1,5 @@ #version 450 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly restrict mediump uniform image3D uOutput; layout(set=0, binding=1) uniform mediump sampler3D uInput; diff --git a/source/backend/vulkan/execution/glsl/relu.comp b/source/backend/vulkan/execution/glsl/relu.comp index eb0fec02..dee41cc4 100644 --- a/source/backend/vulkan/execution/glsl/relu.comp +++ b/source/backend/vulkan/execution/glsl/relu.comp @@ -1,6 +1,5 @@ #version 450 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly restrict mediump uniform image3D uOutput; layout(set=0, binding=1) uniform mediump sampler3D uInput; diff --git a/source/backend/vulkan/execution/glsl/relu6.comp b/source/backend/vulkan/execution/glsl/relu6.comp index 3223b38a..7284753b 100644 --- a/source/backend/vulkan/execution/glsl/relu6.comp +++ b/source/backend/vulkan/execution/glsl/relu6.comp @@ -1,6 +1,5 @@ #version 450 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly restrict mediump uniform image3D uOutput; layout(set=0, binding=1) uniform mediump sampler3D uInput; diff --git a/source/backend/vulkan/execution/glsl/resizeBilinear.comp b/source/backend/vulkan/execution/glsl/resizeBilinear.comp index aec8d2e9..51800700 100644 --- a/source/backend/vulkan/execution/glsl/resizeBilinear.comp +++ b/source/backend/vulkan/execution/glsl/resizeBilinear.comp @@ -1,6 +1,5 @@ #version 450 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) uniform mediump sampler3D uInput; layout(set=0, binding=1) writeonly restrict mediump uniform image3D uOutput; diff --git a/source/backend/vulkan/execution/glsl/resizeNearest.comp b/source/backend/vulkan/execution/glsl/resizeNearest.comp index ce007f92..e688b4c9 100644 --- a/source/backend/vulkan/execution/glsl/resizeNearest.comp +++ b/source/backend/vulkan/execution/glsl/resizeNearest.comp @@ -1,6 +1,5 @@ #version 450 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) uniform mediump sampler3D uInput; layout(set=0, binding=1) writeonly restrict mediump uniform image3D uOutput; diff --git a/source/backend/vulkan/execution/glsl/unPackImage4x4.comp b/source/backend/vulkan/execution/glsl/unPackImage4x4.comp index fcbd0c52..c10f6812 100644 --- a/source/backend/vulkan/execution/glsl/unPackImage4x4.comp +++ b/source/backend/vulkan/execution/glsl/unPackImage4x4.comp @@ -1,6 +1,5 @@ #version 440 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) uniform mediump 
sampler2D uInput; layout(set=0, binding=1) writeonly buffer sourceBuffer{ diff --git a/source/backend/vulkan/execution/glsl/unaryImage.comp b/source/backend/vulkan/execution/glsl/unaryImage.comp index d78eca46..88d2497f 100644 --- a/source/backend/vulkan/execution/glsl/unaryImage.comp +++ b/source/backend/vulkan/execution/glsl/unaryImage.comp @@ -102,6 +102,11 @@ void main() #endif #ifdef ROUND value = round(value); +#endif +#ifdef HARDSWISH + const vec4 leftMask = vec4(greaterThan(value, vec4(-3.0f))); + const vec4 rightMask = vec4(lessThan(value, vec4(3.0f))); + value = leftMask*value*(rightMask*((value+3.0f)/6.0f) + 1.0f - rightMask); #endif imageStore(uOutput, pos, value); } diff --git a/source/backend/vulkan/execution/glsl/winogradTransformDest2_3_1.comp b/source/backend/vulkan/execution/glsl/winogradTransformDest2_3_1.comp index e0987982..acf42fbb 100644 --- a/source/backend/vulkan/execution/glsl/winogradTransformDest2_3_1.comp +++ b/source/backend/vulkan/execution/glsl/winogradTransformDest2_3_1.comp @@ -1,6 +1,5 @@ #version 450 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly restrict uniform image3D uOutput; layout(set=0, binding=1) uniform sampler2D uInput; layout(set=0, binding=2) uniform sampler2D uBias; diff --git a/source/backend/vulkan/execution/glsl/winogradTransformSource2_3_1.comp b/source/backend/vulkan/execution/glsl/winogradTransformSource2_3_1.comp index fa8e3f68..3e3b6996 100644 --- a/source/backend/vulkan/execution/glsl/winogradTransformSource2_3_1.comp +++ b/source/backend/vulkan/execution/glsl/winogradTransformSource2_3_1.comp @@ -1,6 +1,5 @@ #version 450 core layout(std430) buffer; -layout(std430) uniform; layout(set=0, binding=0) writeonly restrict uniform image2D uOutput; layout(set=0, binding=1) uniform sampler3D uInput; layout(set=0, binding=2) readonly restrict uniform constBuffer { diff --git a/source/backend/vulkan/runtime/VulkanRuntime.cpp b/source/backend/vulkan/runtime/VulkanRuntime.cpp index b0b63e14..eba526df 100644 --- a/source/backend/vulkan/runtime/VulkanRuntime.cpp +++ b/source/backend/vulkan/runtime/VulkanRuntime.cpp @@ -88,7 +88,8 @@ void VulkanRuntime::onGabageCollect(int level) { mPipelineFactory->reset(); } -Backend* VulkanRuntime::onCreate() const { +Backend* VulkanRuntime::onCreate(const BackendConfig* config) const { + // FIXME: Use config return new VulkanBackend(this, mInfo); } static bool _testVulkan() { diff --git a/source/backend/vulkan/runtime/VulkanRuntime.hpp b/source/backend/vulkan/runtime/VulkanRuntime.hpp index 01a3e4a7..50ce79b6 100644 --- a/source/backend/vulkan/runtime/VulkanRuntime.hpp +++ b/source/backend/vulkan/runtime/VulkanRuntime.hpp @@ -25,7 +25,7 @@ public: VulkanRuntime(const Backend::Info& info); virtual ~ VulkanRuntime(); - virtual Backend* onCreate() const override; + virtual Backend* onCreate(const BackendConfig* config) const override; enum GPUType { ADRENO = 0, MALI = 1, OTHER = 2 }; virtual void onGabageCollect(int level) override; virtual float onGetMemoryInMB() override; diff --git a/source/backend/vulkan/shaders/AllShader.h b/source/backend/vulkan/shaders/AllShader.h index 1484395a..1da55af9 100644 --- a/source/backend/vulkan/shaders/AllShader.h +++ b/source/backend/vulkan/shaders/AllShader.h @@ -68,6 +68,8 @@ extern const unsigned char glsl_unaryImage_LOG1P_comp[]; extern unsigned int glsl_unaryImage_LOG1P_comp_len; extern const unsigned char glsl_unaryImage_ROUND_comp[]; extern unsigned int glsl_unaryImage_ROUND_comp_len; +extern const unsigned char 
glsl_unaryImage_HARDSWISH_comp[]; +extern unsigned int glsl_unaryImage_HARDSWISH_comp_len; extern const unsigned char glsl_im2col_comp[]; extern unsigned int glsl_im2col_comp_len; extern const unsigned char glsl_convolutionDepthwise_comp[]; diff --git a/source/core/AutoStorage.h b/source/core/AutoStorage.h index 11c3c604..e56eeab6 100644 --- a/source/core/AutoStorage.h +++ b/source/core/AutoStorage.h @@ -108,6 +108,114 @@ private: T* mData = NULL; int mSize = 0; }; + +/** Auto Release Class*/ +template +class AutoRelease { +public: + AutoRelease(T* d = nullptr) { + mData = d; + } + ~AutoRelease() { + if (NULL != mData) { + delete mData; + } + } + AutoRelease(const AutoRelease&) = delete; + T* operator->() { + return mData; + } + void reset(T* d) { + if (nullptr != mData) { + delete mData; + } + mData = d; + } + T* get() { + return mData; + } + const T* get() const { + return mData; + } +private: + T* mData = NULL; +}; + + +class RefCount +{ + public: + void addRef() const + { + mNum++; + } + void decRef() const + { + --mNum; + MNN_ASSERT(mNum>=0); + if (0 >= mNum) + { + delete this; + } + } + protected: + RefCount():mNum(1){} + RefCount(const RefCount& f):mNum(f.mNum){} + void operator=(const RefCount& f) + { + if (this != &f) + { + mNum = f.mNum; + } + } + virtual ~RefCount(){} + private: + inline int count() const{return mNum;} + mutable int mNum; +}; + +#define SAFE_UNREF(x)\ + if (NULL!=(x)) {(x)->decRef();} +#define SAFE_REF(x)\ + if (NULL!=(x)) (x)->addRef(); + +#define SAFE_ASSIGN(dst, src) \ + {\ + if (src!=NULL)\ + {\ + src->addRef();\ + }\ + if (dst!=NULL)\ + {\ + dst->decRef();\ + }\ + dst = src;\ + } +template +class SharedPtr { + public: + SharedPtr() : mT(NULL) {} + SharedPtr(T* obj) : mT(obj) {} + SharedPtr(const SharedPtr& o) : mT(o.mT) { SAFE_REF(mT); } + ~SharedPtr() { SAFE_UNREF(mT); } + + SharedPtr& operator=(const SharedPtr& rp) { + SAFE_ASSIGN(mT, rp.mT); + return *this; + } + SharedPtr& operator=(T* obj) { + SAFE_UNREF(mT); + mT = obj; + return *this; + } + + T* get() const { return mT; } + T& operator*() const { return *mT; } + T* operator->() const { return mT; } + + private: + T* mT; +}; } // namespace MNN #endif /* AutoStorage_h */ diff --git a/source/core/Backend.hpp b/source/core/Backend.hpp index c3827493..c9532676 100644 --- a/source/core/Backend.hpp +++ b/source/core/Backend.hpp @@ -10,7 +10,6 @@ #define Backend_hpp #include -#include #include #include #include @@ -170,8 +169,8 @@ public: * @param qtype quant data type. * @return support type for op. 
*/ - virtual halide_type_t getRunType(const MNN::Op* op, halide_type_t qtype) { - return halide_type_of(); + virtual halide_type_t getRunType(const MNN::Op* op, halide_type_t qtype, halide_type_t rtype) { + return rtype; } public: /** @@ -208,7 +207,7 @@ public: @brief create backend @return created backend */ - virtual Backend* onCreate() const = 0; + virtual Backend* onCreate(const BackendConfig* config = nullptr) const = 0; /** @brief clear unuseful resource diff --git a/source/core/BackendRegister.cpp b/source/core/BackendRegister.cpp index 35c22d49..441f4809 100644 --- a/source/core/BackendRegister.cpp +++ b/source/core/BackendRegister.cpp @@ -31,7 +31,7 @@ void registerBackend() { #if MNN_METAL_ENABLED registerMetalRuntimeCreator(); #endif -#if defined(ENABLE_ARMV82) && defined(__aarch64__) +#if defined(ENABLE_ARMV82) && (defined(__ANDROID__) || defined(__aarch64__)) registerArm82RuntimeCreator(); #endif #endif diff --git a/source/core/BufferAllocator.cpp b/source/core/BufferAllocator.cpp index b1909036..43776445 100644 --- a/source/core/BufferAllocator.cpp +++ b/source/core/BufferAllocator.cpp @@ -58,7 +58,7 @@ std::shared_ptr BufferAllocator::Allocator::createRe } BufferAllocator::Node::~Node() { - if (nullptr == parent) { + if (nullptr == parent.get()) { outside->onRelease(pointer); } } @@ -90,7 +90,7 @@ std::pair BufferAllocator::alloc(int size, bool seperate) { mTotalSize += size; // save node - std::shared_ptr node(new Node); + SharedPtr node(new Node); node->size = size; node->pointer = pointer; mUsedList[pointer] = node; @@ -102,11 +102,11 @@ std::pair BufferAllocator::alloc(int size, bool seperate) { return pointer; } -void BufferAllocator::returnMemory(FREELIST* listP, std::shared_ptr node, bool permitMerge) { +void BufferAllocator::returnMemory(FREELIST* listP, SharedPtr node, bool permitMerge) { auto& list = *listP; list.insert(std::make_pair(node->size, node)); // update parent use count - if (nullptr != node->parent && permitMerge) { + if (nullptr != node->parent.get() && permitMerge) { auto parent = node->parent; parent->useCount -= 1; @@ -115,7 +115,7 @@ void BufferAllocator::returnMemory(FREELIST* listP, std::shared_ptr node, while (needMerge) { // collect all subnodes for (auto iter = list.begin(); iter != list.end();) { - if (iter->second->parent == parent) { + if (iter->second->parent.get() == parent.get()) { iter = list.erase(iter); continue; } @@ -125,7 +125,7 @@ void BufferAllocator::returnMemory(FREELIST* listP, std::shared_ptr node, // do merge downside up list.insert(std::make_pair(parent->size, parent)); needMerge = false; - if (parent->parent != nullptr) { + if (parent->parent.get() != nullptr) { parent = parent->parent; parent->useCount -= 1; needMerge = parent->useCount == 0; @@ -165,7 +165,7 @@ void BufferAllocator::release(bool allRelease) { return; } for (auto f : mFreeList) { - if (f.second->parent == nullptr) { + if (f.second->parent.get() == nullptr) { MNN_ASSERT(mTotalSize >= f.first); mTotalSize -= f.first; } @@ -210,7 +210,7 @@ std::pair BufferAllocator::getFromFreeList(FREELIST* list, int size, // update parent use count auto pointer = x->second->pointer; - if (permiteSplit && nullptr != x->second->parent) { + if (permiteSplit && nullptr != x->second->parent.get()) { x->second->parent->useCount += 1; } @@ -223,7 +223,7 @@ std::pair BufferAllocator::getFromFreeList(FREELIST* list, int size, } // split otherwise - std::shared_ptr first(new Node); + SharedPtr first(new Node); first->parent = x->second; first->size = sizeAlign; first->pointer =
x->second->pointer; @@ -231,7 +231,7 @@ std::pair BufferAllocator::getFromFreeList(FREELIST* list, int size, mUsedList.insert(std::make_pair(pointer, first)); x->second->useCount += 1; - std::shared_ptr second(new Node); + SharedPtr second(new Node); second->outside = mAllocator.get(); second->parent = x->second; second->size = x->second->size - sizeAlign; diff --git a/source/core/BufferAllocator.hpp b/source/core/BufferAllocator.hpp index 079d6bd9..4012e525 100644 --- a/source/core/BufferAllocator.hpp +++ b/source/core/BufferAllocator.hpp @@ -14,6 +14,7 @@ #include #include "MNNMemoryUtils.h" #include "NonCopyable.hpp" +#include "AutoStorage.h" namespace MNN { @@ -92,22 +93,22 @@ public: void endGroup(); private: - class Node { + class Node : public RefCount { public: ~Node(); std::pair pointer; - std::shared_ptr parent = nullptr; + SharedPtr parent = nullptr; int32_t size; int16_t useCount = 0; Allocator* outside = nullptr; }; - typedef std::multimap> FREELIST; + typedef std::multimap> FREELIST; - static void returnMemory(FREELIST* list, std::shared_ptr node, bool permitMerge = true); + static void returnMemory(FREELIST* list, SharedPtr node, bool permitMerge = true); std::pair getFromFreeList(FREELIST* list, int size, bool permiteSplit = true); - std::map, std::shared_ptr> mUsedList; + std::map, SharedPtr> mUsedList; FREELIST mFreeList; size_t mTotalSize = 0; diff --git a/source/core/ConvolutionCommon.cpp b/source/core/ConvolutionCommon.cpp index 6130951a..d4b9dd90 100644 --- a/source/core/ConvolutionCommon.cpp +++ b/source/core/ConvolutionCommon.cpp @@ -363,7 +363,7 @@ static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, uint32_t *len, const *len = Size; return blob; } -std::shared_ptr ConvolutionCommon::load(const IDSTQuan *quan, bool forceFloat) { +std::shared_ptr ConvolutionCommon::load(const IDSTQuan *quan, bool forceFloat, bool forceInt8) { auto result = std::make_shared(); uint32_t weightLength = 0; int8_t *buffer = nullptr; @@ -393,36 +393,17 @@ std::shared_ptr ConvolutionCommon::load(const IDS // weight int8 only if (4 == quan->type()) { weightLength = quan->buffer()->size(); - result->weightFloat.reset(weightLength); - const int kernelNum = quan->aMax(); - int kernelSize = weightLength / kernelNum; - auto minAndScalsSize = quan->alpha()->size(); - if (minAndScalsSize != (2 * kernelNum)) { - MNN_ERROR("recover int8 weights error.\n"); - } - auto minAndScales = quan->alpha()->data(); - auto int8Weights = quan->buffer()->data(); - auto weightPtr = result->weightFloat.get(); - - for (int k = 0; k < kernelNum; k++) { - auto kernelMinAndScale = minAndScales + k * 2; - float min = kernelMinAndScale[0]; - float scale = kernelMinAndScale[1]; - int beginIndex = k * kernelSize; - for (int s = 0; s < kernelSize; s++) { - int8_t quantWeight = int8Weights[beginIndex + s]; - float oriWeight = (quantWeight - (-128)) * scale + min; - weightPtr[beginIndex + s] = oriWeight; - } - } - return result; + result->weight.reset(weightLength); + ::memcpy(result->weight.get(), quan->buffer()->data(), weightLength); } - if (nullptr == buffer) { - MNN_PRINT("Alloc memory error for extract idst int8\n"); - return nullptr; + if (result->weight.get() == nullptr) { + if (nullptr == buffer) { + MNN_PRINT("Alloc memory error for extract idst int8\n"); + return nullptr; + } + result->weight.set(buffer, weightLength); } - result->weight.set(buffer, weightLength); result->quan = quan; result->alpha.reset(quan->alpha()->size()); if (nullptr == result->alpha.get()) { @@ -430,7 +411,9 @@ std::shared_ptr 
ConvolutionCommon::load(const IDS return nullptr; } ::memcpy(result->alpha.get(), quan->alpha()->data(), quan->alpha()->size() * sizeof(float)); - + if (forceInt8) { + return result; + } if (!quan->has_scaleInt() || forceFloat) { // Back to float result->weightFloat.reset(weightLength); @@ -451,8 +434,9 @@ std::shared_ptr ConvolutionCommon::load(const IDS if (result->alpha.size() == 2 * outputCount) { float min = result->alpha.get()[2*o]; float alpha = result->alpha.get()[2*o+1]; + float clampMin = quan->aMin(); for (int j = 0; j < partWeightSize; ++j) { - dstW[j] = (( (float)srcW[j] - (-128) ) * alpha + min) * quan->quantScale(); + dstW[j] = (( (float)srcW[j] - clampMin ) * alpha + min) * quan->quantScale(); } } else { float alpha = result->alpha.get()[o]; @@ -483,6 +467,41 @@ void ConvolutionCommon::getConvParameters(std::shared_ptr *quanCommo } } +bool ConvolutionCommon::getConvInt8Parameters(const MNN::Convolution2D* conv2d, std::shared_ptr& quanCommon, + const int8_t*& weight, float*& scale, int32_t*& bias, + float inputScale, float outputScale) { + int outputCount = conv2d->common()->outputCount(); + weight = conv2d->symmetricQuan()->weight()->data(); + if (conv2d->quanParameter() != nullptr) { + quanCommon = ConvolutionCommon::load(conv2d->quanParameter(), false, true); + weight = quanCommon->weight.get(); + } + if (weight == nullptr) { + MNN_ERROR("ConvolutionCommon::getConvInt8Parameters: No weight data!"); + return false; + } + if (conv2d->symmetricQuan()->bias() && conv2d->symmetricQuan()->scale()) { + MNN_ASSERT(conv2d->symmetricQuan()->bias()->size() == outputCount && conv2d->symmetricQuan()->scale()->size() == outputCount); + ::memcpy(bias, conv2d->symmetricQuan()->bias()->data(), outputCount * sizeof(int32_t)); + ::memcpy(scale, conv2d->symmetricQuan()->scale()->data(), outputCount * sizeof(float)); + return true; + } + if (conv2d->bias() && quanCommon->alpha.get()) { + inputScale = inputScale == 0.f ? conv2d->quanParameter()->scaleIn() : inputScale; + outputScale = outputScale == 0.f ? 
conv2d->quanParameter()->scaleOut() : outputScale; + auto biasData = conv2d->bias()->data(); + auto alphaData = quanCommon->alpha.get(); + auto alphaScale = inputScale / outputScale; + for (int i = 0; i < outputCount; i++) { + scale[i] = alphaData[i] * alphaScale; + bias[i] = static_cast(biasData[i] / (inputScale * alphaData[i])); + } + return true; + } + MNN_ERROR("ConvolutionCommon::getConvInt8Parameters: No bias & scale data!"); + return false; +} + std::pair ConvolutionCommon::convolutionPad(const Tensor *input, const Tensor *output, const Convolution2DCommon *mCommon) { if (mCommon->padMode() == PadMode_SAME) { diff --git a/source/core/ConvolutionCommon.hpp b/source/core/ConvolutionCommon.hpp index ee6c1989..e28049d2 100644 --- a/source/core/ConvolutionCommon.hpp +++ b/source/core/ConvolutionCommon.hpp @@ -20,8 +20,10 @@ public: AutoStorage weightFloat; const IDSTQuan* quan; }; - static std::shared_ptr load(const IDSTQuan* quan, bool forceFloat = false); + static std::shared_ptr load(const IDSTQuan* quan, bool forceFloat = false, bool forceInt8 = false); static void getConvParameters(std::shared_ptr *quanCommon, const MNN::Convolution2D *conv2d, const float** originWeight, int* originWeightSize); + static bool getConvInt8Parameters(const MNN::Convolution2D* conv2d, std::shared_ptr& quanCommon, + const int8_t*& weight, float*& scale, int32_t*& bias, float inputScale, float outputScale); // Return padX, padY static std::pair convolutionPad(const Tensor* input, const Tensor* output, diff --git a/source/core/DirectedAcyclicGraph.hpp b/source/core/DirectedAcyclicGraph.hpp deleted file mode 100644 index fdb20c66..00000000 --- a/source/core/DirectedAcyclicGraph.hpp +++ /dev/null @@ -1,204 +0,0 @@ -// -// DirectedAcyclicGraph.hpp -// MNN -// -// Created by MNN on 2019/01/30. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include -#include -#include -#include - -using namespace std; -namespace MNN { -template -class Node; - -template -class Edge { -public: - void setSrc(shared_ptr > node) { - this->srcNode = weak_ptr >(node); - } - - void setDst(shared_ptr > node) { - this->dstNode = weak_ptr >(node); - } - - const weak_ptr > getSrc() { - return srcNode; - } - - const weak_ptr > getDst() { - return dstNode; - } - -private: - weak_ptr > srcNode; - weak_ptr > dstNode; -}; - -template -class Node { -public: - void addInEdge(shared_ptr > edge) { - this->inEdges.insert(edge); - } - - void addOutEdge(shared_ptr > edge) { - this->outEdges.insert(edge); - } - - const unordered_set > > getInEdges() { - return inEdges; - } - - const unordered_set > > getOutEdges() { - return outEdges; - } - - const int getInEdgesCount() { - return (int)inEdges.size(); - } - - void setData(T d) { - this->data = d; - } - - T getData() { - return data; - } - -private: - T data; - unordered_set > > inEdges; - unordered_set > > outEdges; -}; - -template -class NodeDef { -public: - virtual shared_ptr > makeNode() { - return make_shared >(); - } -}; - -/** - * A DirectedAcyclicGraph describes a set of computations that are to be - * performed, as well as the dependencies between those - * computations. The basic model is a DAG (directed acyclic graph) - */ -template -class DirectedAcyclicGraph { -public: - /** - * Adds a new node to this graph, and returns it. - */ - shared_ptr > AddNode(NodeDef& node_def) { - shared_ptr > node = node_def.makeNode(); - nodes.insert(make_pair(node, nodes.size())); - return node; - } - - /** - * Adds an edge that connects `source` input of - * `dest` and returns it. 
- */ - const shared_ptr > AddEdge(shared_ptr > source, shared_ptr > dest) { - shared_ptr > edge = make_shared >(); - edge->setSrc(source); - edge->setDst(dest); - source->addOutEdge(edge); - dest->addInEdge(edge); - edges.insert(make_pair(edge, edges.size())); - return edge; - } - - /** - * Stores in *order the post-order numbering of all nodes - * in graph found via topological sorting. - * - * return true if graph does not have cycles else false . - */ - bool GetPostOrder(vector > >& order) { - order.clear(); - return TopologicalSort(order); - } - -private: - /** - * Kahn's algorithm - * topological sort - * - * L ← Empty list that will contain the sorted elements - * S ← Set of all nodes with no incoming edge - * while S is non-empty do - * remove a node n from S - * add n to tail of L - * for each node m with an edge e from n to m do - * remove edge e from the graph - * if m has no other incoming edges then - * insert m into S - * if graph has edges then - * return error (graph has at least one cycle) - * else - * return L (a topologically sorted order) - */ - bool TopologicalSort(vector > >& order) { - struct TopoNode { - shared_ptr > node; - unordered_set > > outEdges; - }; - - unordered_map >, unordered_set > > > nodesInEdges; - /*no incoming node*/ - vector noIncoming; - typename unordered_map >, int>::iterator iter; - for (iter = this->nodes.begin(); iter != this->nodes.end(); iter++) { - if (iter->first->getInEdgesCount() <= 0) { - TopoNode tn; - tn.node = iter->first; - tn.outEdges = iter->first->getOutEdges(); - noIncoming.push_back(tn); - } else { - nodesInEdges.insert(make_pair(iter->first, iter->first->getInEdges())); - } - } - while (noIncoming.size() > 0) { - TopoNode n = noIncoming.back(); - noIncoming.pop_back(); - order.push_back(n.node); - for (const shared_ptr >& outEdge : n.outEdges) { - const weak_ptr > oNode = outEdge->getDst(); - if (!oNode.expired()) { - const shared_ptr > node = oNode.lock(); - /*find node from nodesInEdges and remove edge*/ - auto edg_iter = nodesInEdges.find(node); - if (edg_iter != nodesInEdges.end()) { - edg_iter->second.erase(outEdge); - if (edg_iter->second.size() <= 0) { - TopoNode tn; - tn.node = node; - tn.outEdges = node->getOutEdges(); - noIncoming.push_back(tn); - nodesInEdges.erase(edg_iter); - } - } - // ASSERT(edg_iter == nodes.end()) - } - } - } - if (nodesInEdges.size() > 0) { - return false; - } - return true; - } - -private: - // Allocated nodes and edges. 
- unordered_map >, int> nodes; - unordered_map >, int> edges; -}; -} // namespace MNN diff --git a/source/core/FileLoader.cpp b/source/core/FileLoader.cpp index 6e8cc7b0..345f35f8 100644 --- a/source/core/FileLoader.cpp +++ b/source/core/FileLoader.cpp @@ -14,7 +14,7 @@ namespace MNN { FileLoader::FileLoader(const char* file) { #if defined(_MSC_VER) wchar_t wFilename[1024]; - if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, file, -1, wFilename, sizeof(wFilename))) { + if (0 == MultiByteToWideChar(CP_ACP, 0, file, -1, wFilename, sizeof(wFilename))) { mFile = nullptr; return; } diff --git a/source/core/Interpreter.cpp b/source/core/Interpreter.cpp index 6d075d2d..87072af4 100644 --- a/source/core/Interpreter.cpp +++ b/source/core/Interpreter.cpp @@ -18,6 +18,7 @@ #include "core/Pipeline.hpp" #include "core/RuntimeFactory.hpp" #include "core/Session.hpp" + namespace MNN { struct Content { @@ -81,12 +82,14 @@ Interpreter* Interpreter::createFromBufferInternal(Content* net) { MNN_PRINT("Buffer is null for create interpreter\n"); return nullptr; } +#ifndef MNN_BUILD_MINI flatbuffers::Verifier verify((const uint8_t*)(net->buffer.get()), net->buffer.size()); if (false == VerifyNetBuffer(verify)) { MNN_PRINT("Invalidate buffer to create interpreter\n"); delete net; return nullptr; } +#endif net->net = GetNet(net->buffer.get()); if (nullptr == net->net->oplists()) { MNN_ERROR("Model has no oplist\n"); @@ -317,9 +320,7 @@ void Interpreter::resizeSession(Session* session) { MNN_ERROR("The model buffer has been released. Can't resize session\n"); return; } - if (session->getNeedResize()) { - session->resize(); - } + session->resize(); } ErrorCode Interpreter::runSessionWithCallBack(const Session* session, const TensorCallBack& before, @@ -344,7 +345,9 @@ const Backend* Interpreter::getBackend(const Session* session, const Tensor* ten void Interpreter::releaseModel() { std::unique_lock _l(mNet->lock); - mNet->buffer.release(); + if (mNet->buffer.get() != nullptr && mNet->net->usage() != Usage_INFERENCE_STATIC) { + mNet->buffer.release(); + } mNet->cacheBuffer.release(); } @@ -410,15 +413,17 @@ bool Interpreter::getSessionInfo(const Session* session, SessionInfoCode code, v return session->getInfo(code, ptr); } -static Runtime* _getDefaultBackend(RuntimeInfo& rt) { +static void _getDefaultBackend(RuntimeInfo& rt) { auto defaultType = MNN_FORWARD_CPU; + if (rt.first.find(defaultType) != rt.first.end()) { + rt.second = rt.first[defaultType]; + } if (rt.second == nullptr) { Backend::Info info; info.type = defaultType; info.numThread = 1; rt.second.reset(RuntimeFactory::create(info)); } - return rt.second.get(); } RuntimeInfo Interpreter::createRuntime(const std::vector& configs) { RuntimeInfo res; @@ -436,8 +441,8 @@ RuntimeInfo Interpreter::createRuntime(const std::vector& config } mRuntimes[compute.type].reset(newBn); } - _getDefaultBackend(res); } + _getDefaultBackend(res); return res; } diff --git a/source/core/Macro.h b/source/core/Macro.h index 23f3700f..2f8160c2 100644 --- a/source/core/Macro.h +++ b/source/core/Macro.h @@ -17,6 +17,11 @@ #define ROUND_UP(x, y) (((x) + (y) - (1)) / (y) * (y)) #define ALIGN_UP4(x) ROUND_UP((x), 4) #define ALIGN_UP8(x) ROUND_UP((x), 8) + +// fraction length difference is 16bit. 
calculate the real value, it's about 0.00781 +#define F32_BF16_MAX_LOSS ((0xffff * 1.0f ) / ( 1 << 23 )) + + #ifndef MNN_USE_NEON #if (__arm__ || __aarch64__) && (defined(__ARM_NEON__) || defined(__ARM_NEON)) #define MNN_USE_NEON diff --git a/source/core/OpCommonUtils.cpp b/source/core/OpCommonUtils.cpp index bd4f0296..f651d3bb 100644 --- a/source/core/OpCommonUtils.cpp +++ b/source/core/OpCommonUtils.cpp @@ -313,4 +313,60 @@ int OpCommonUtils::computeStride(int32_t* strides, const int* shape, int length) return stride; } +bool OpCommonUtils::opNeedContent(int type, int index) { + switch (type) { + case OpType_ZerosLike: + case OpType_ZeroGrad: + case OpType_Shape: + case OpType_Rank: + case OpType_Const: + case OpType_Size: + case OpType_PriorBox: + return false; + case OpType_Interp: + case OpType_Crop: + case OpType_Reshape: + case OpType_Reduction: + case OpType_Resize: + if (1 == index) { + return false; + } + break; + default: + break; + } + return true; +} +bool OpCommonUtils::opCompabilityForLowp(const Op* op) { + switch (op->type()) { + case OpType_Scale: + case OpType_Convolution: + case OpType_ConvolutionDepthwise: + case OpType_Deconvolution: + case OpType_DeconvolutionDepthwise: + case OpType_MatMul: + case OpType_BatchMatMul: + return true; + default: + break; + } + return false; +} + +std::pair OpCommonUtils::getQuantInfo(const std::vector& inputs) { + if (!inputs.empty()) { + for (auto t : inputs) { + if (TensorUtils::getDescribe(t)->memoryType == Tensor::InsideDescribe::MEMORY_VIRTUAL + && !TensorUtils::getDescribe(t)->regions.empty()) { + t = TensorUtils::getDescribe(t)->regions[0].origin; + } + auto& quantAttr = TensorUtils::getDescribe(t)->quantAttr; + if (quantAttr != nullptr) { + return std::make_pair(true, quantAttr->type); + } + } + } + return std::make_pair(false, DataType_DT_FLOAT); +} + } // namespace MNN diff --git a/source/core/OpCommonUtils.hpp b/source/core/OpCommonUtils.hpp index 22615f2a..4eea9fb7 100644 --- a/source/core/OpCommonUtils.hpp +++ b/source/core/OpCommonUtils.hpp @@ -32,6 +32,11 @@ public: const SPLITS& dstSplits, int pack = 4); static void turnToPackRegion(const Tensor::InsideDescribe::Region& region, Tensor::InsideDescribe::Region& c4Region, const SPLITS& srcSplits, const SPLITS& dstSplits, int pack = 4); + static bool opNeedContent(int type, int index); + + // For lowp CPU Backend + static bool opCompabilityForLowp(const Op* op); + static std::pair getQuantInfo(const std::vector& inputs); }; } // namespace MNN diff --git a/source/core/Pipeline.cpp b/source/core/Pipeline.cpp index 7f0d5390..82e73510 100644 --- a/source/core/Pipeline.cpp +++ b/source/core/Pipeline.cpp @@ -14,12 +14,7 @@ #include "core/WrapExecution.hpp" #include "geometry/GeometryComputerUtils.hpp" #include "shape/SizeComputer.hpp" -//#define MNN_OPEN_TIME_TRACE -#include -//#define MNN_DEBUG_TENSOR_SIZE -//#define MNN_DEBUG_PREPARE -#define MNN_FAST_RESIZE namespace MNN { OperatorInfo::OperatorInfo() { @@ -96,6 +91,8 @@ static bool _allocTensor(Tensor* t, Backend* curBackend) { void Pipeline::UnitInfo::setUp(const Command& command, int index) { if (nullptr != command.op->name()) { mContent->name = command.op->name()->str(); + } else if (!command.name.empty()) { + mContent->name = command.name; } else { char buffer[20]; sprintf(buffer, "%d", index); @@ -120,7 +117,9 @@ Pipeline::Pipeline(std::vector&& infos, std::shared_ptr< mBackend = backend; mAllocInput = allocInput; mInfo = std::move(infos); +#ifndef MNN_BUILD_MINI GeometryComputerUtils::buildConstantTensors(mInfo, 
mBackupBackend, !mAllocInput, mConstTensors, mMidConstTensors); +#endif } void Pipeline::cloneExecution(const std::map>& cache) { Execution* dst; @@ -135,24 +134,18 @@ void Pipeline::cloneExecution(const std::mapUnPack()); - builder.Finish(lastOffset); Command cmd; - cmd.buffer.resize(builder.GetSize()); - ::memcpy(cmd.buffer.data(), builder.GetBufferPointer(), cmd.buffer.size()); cmd.outputs = info.outputs; cmd.inputs = info.inputs; - cmd.op = flatbuffers::GetMutableRoot(cmd.buffer.data()); + cmd.op = info.op; mBuffer.command.push_back(cmd); // mBuffer.command.emplace_back(GeometryComputerUtils::makeCommand(info.op->UnPack(), info.inputs, // info.outputs)); } - return NO_ERROR; } else { #ifndef MNN_BUILD_MINI mContext.clear(); @@ -172,24 +165,169 @@ ErrorCode Pipeline::encode(bool isStatic) { } } mInit = true; - return GeometryComputerUtils::shapeComputeAndGeometryTransform(mInfo, mBuffer, mContext, mBackupBackend, mUseGeometry); + auto res = GeometryComputerUtils::shapeComputeAndGeometryTransform(mInfo, mBuffer, mContext, mBackupBackend, mUseGeometry); + if (res != NO_ERROR) { + return res; + } #endif } + bool isQuantModel = false; + // Set Op + for (auto& iter : mBuffer.command) { + if (!iter.buffer.empty()) { + iter.op = flatbuffers::GetRoot((void*)iter.buffer.data()); + } + for (auto t : iter.outputs) { + if (TensorUtils::getDescribe(t)->quantAttr.get() != nullptr) { + isQuantModel = true; + } + } + } + // Propagate Scale + if (isQuantModel) { + // get propagate map + using PropagateMap = std::map>; + PropagateMap forwardMap, backwardMap; + auto insertPropagateMap = [](PropagateMap& propagateMap, const Tensor* s, const Tensor* t) { + if (propagateMap.find(s) == propagateMap.end()) { + propagateMap[s] = std::set({t}); + } else { + propagateMap[s].insert(t); + } + }; + std::set propagateOpTypes = { OpType_Pooling, OpType_Raster, OpType_ReLU, OpType_ReLU6, + OpType_Interp, OpType_CropAndResize, OpType_ROIPooling, OpType_Gather, + OpType_GatherV2, OpType_GatherV2, OpType_ScatterNd }; + for (const auto& cmd : mBuffer.command) { + const auto type = cmd.op->type(); + const auto output = cmd.outputs[0]; + if (propagateOpTypes.find(type) != propagateOpTypes.end()) { + if (type == OpType_Raster) { + const auto des = MNN::TensorUtils::getDescribe(cmd.inputs[0]); + for (auto& r : des->regions) { + insertPropagateMap(forwardMap, r.origin, output); + insertPropagateMap(backwardMap, output, r.origin); + } + } else { + for (auto t : cmd.inputs) { + insertPropagateMap(forwardMap, t, output); + insertPropagateMap(backwardMap, output, t); + } + } + } + } + auto getStart = [&forwardMap, &backwardMap](bool forward) { + auto& propagateMap = forward ? forwardMap : backwardMap; + auto& antiMap = forward ? 
backwardMap : forwardMap; + // delete N->1 Map of Op + for (const auto& iter : antiMap) { + if (iter.second.size() > 1) { + for (auto t : iter.second) { + auto res = propagateMap.find(t); + if (res != propagateMap.end()) { + propagateMap.erase(res); + } + } + } + } + std::set root, leaf, start; + for (const auto& iter : propagateMap) { + root.insert(iter.first); + for (auto t : iter.second) { + leaf.insert(t); + } + } + std::set_difference(root.begin(), root.end(), leaf.begin(), leaf.end(), std::inserter(start, start.begin())); + return start; + }; + auto forwardStart = getStart(true); + auto backwardStart = getStart(false); + // propagate scale + auto propagateScale = [](PropagateMap& propagateMap, std::set& start) { + std::function scalePropagate = [&propagateMap, &scalePropagate](const Tensor* t) { + if (TensorUtils::getDescribe(t)->quantAttr.get() == nullptr) { + return false; + } + if (propagateMap.find(t) == propagateMap.end()) { + return false; + } + bool change = false; + for (auto x : propagateMap[t]) { + if (TensorUtils::getDescribe(x)->quantAttr != TensorUtils::getDescribe(t)->quantAttr) { + TensorUtils::getDescribe(x)->quantAttr = TensorUtils::getDescribe(t)->quantAttr; + change = true; + } + change |= scalePropagate(x); + } + return change; + }; + bool change = false; + for (auto t : start) { + change |= scalePropagate(t); + } + return change; + }; + for (int i = 0; i < 3 && (propagateScale(forwardMap, forwardStart) || propagateScale(backwardMap, backwardStart)); i++); + } + mExecutions.resize(mBuffer.command.size()); + for (int i = 0; i < mBuffer.command.size(); ++i) { + mExecutions[i] = nullptr; + } + /** Prepare DebugInfo*/ + if (supportDebug) { + mDebugInfos.resize(mBuffer.command.size()); + for (int i = 0; i < mBuffer.command.size(); ++i) { + mDebugInfos[i].setUp(mBuffer.command[i], i); + } + } return NO_ERROR; } -ErrorCode Pipeline::allocMemory(bool supportDebug) { - mExecutions.clear(); - mDebugInfos.clear(); - mBackend->onClearBuffer(); - mBackupBackend->onClearBuffer(); - - /** Prepare Execution And Alloc*/ - // Compute refCount +ErrorCode Pipeline::allocMemory() { + // Compute RefCount for (auto& iter : mBuffer.command) { - if (!iter.buffer.empty()) { - iter.op = flatbuffers::GetMutableRoot((void*)iter.buffer.data()); + for (auto t : iter.inputs) { + auto des = TensorUtils::getDescribe(t); + if (des->memoryType == Tensor::InsideDescribe::MEMORY_VIRTUAL) { + for (auto& r : des->regions) { + TensorUtils::getDescribe(r.origin)->useCount = 0; + if (nullptr != r.offset) { + TensorUtils::getDescribe(r.offset)->useCount = 0; + } + } + } else { + des->useCount = 0; + } } +#if 0 + // dump scale + { + printf("name: %s, inputs: { ", iter.name.c_str()); + auto realInputs = iter.inputs; + if (iter.op->type() == OpType_Raster) { + realInputs.clear(); + for (auto& r : TensorUtils::getDescribe(iter.inputs[0])->regions) { + realInputs.push_back(r.origin); + } + } + for (auto t : realInputs) { + printf("%p -> ", t); + if (TensorUtils::getDescribe(t)->quantAttr) { + printf("%f, ", TensorUtils::getDescribe(t)->quantAttr->scale); + } + } + printf("}, outputs: { "); + for (auto t : iter.outputs) { + printf("%p -> ", t); + if (TensorUtils::getDescribe(t)->quantAttr) { + printf("%f, ", TensorUtils::getDescribe(t)->quantAttr->scale); + } + } + printf(" }\n"); + } +#endif + } + for (auto& iter : mBuffer.command) { for (auto t : iter.inputs) { auto des = TensorUtils::getDescribe(t); if (des->memoryType == Tensor::InsideDescribe::MEMORY_VIRTUAL) { @@ -204,34 +342,47 @@ ErrorCode 
Pipeline::allocMemory(bool supportDebug) { } } } + mBackend->onClearBuffer(); + mBackupBackend->onClearBuffer(); + for (auto& c : mBuffer.command) { + for (auto& t : c.outputs) { + TensorUtils::getDescribe(t)->backend = nullptr; + } + } // Create Execution and Alloc mBackend->onResizeBegin(); - mExecutions.resize(mBuffer.command.size()); for (int i = 0; i < mBuffer.command.size(); ++i) { auto& iter = mBuffer.command[i]; // MNN_PRINT("%d - %s\n", i, EnumNameOpType(iter.op->type())); - mExecutions[i] = nullptr; - bool cached = false; - /** Cache origin execution for fast resize*/ - auto exeIter = mOriginExecution.find(iter.op); - if (exeIter != mOriginExecution.end()) { - mExecutions[i] = exeIter->second; - cached = true; - } - // Create exe + // MNN_PRINT("%s\n", iter.name.c_str()); if (nullptr == mExecutions[i]) { - mExecutions[i].reset(mBackend->onCreate(iter.inputs, iter.outputs, iter.op)); + bool cached = false; + /** Cache origin execution for fast resize*/ + auto exeIter = mOriginExecution.find(iter.op); + if (exeIter != mOriginExecution.end()) { + mExecutions[i] = exeIter->second; + cached = true; + } + // Create exe if (nullptr == mExecutions[i]) { - mExecutions[i].reset(mBackupBackend->onCreate(iter.inputs, iter.outputs, iter.op)); + mExecutions[i].reset(mBackend->onCreate(iter.inputs, iter.outputs, iter.op)); if (nullptr == mExecutions[i]) { - MNN_ERROR("Create exection error : %d\n", iter.op->type()); - return NOT_SUPPORT; + mExecutions[i].reset(mBackupBackend->onCreate(iter.inputs, iter.outputs, iter.op)); + if (nullptr == mExecutions[i]) { + MNN_ERROR("Create exection error : %d\n", iter.op->type()); + return NOT_SUPPORT; + } } } - } - // invalid means memory alloc failed - if (!mExecutions[i]->valid()) { - return OUT_OF_MEMORY; + // invalid means memory alloc failed + if (!mExecutions[i]->valid()) { + mExecutions[i] = nullptr; + return OUT_OF_MEMORY; + } + // FIXME: The cached execution may cause wrap error. Fix it in future + if ((!cached) && iter.buffer.empty() && (iter.op->type() != OpType_Raster) && (iter.op->type() != OpType_BinaryOp)) { + mOriginExecution.insert(std::make_pair(iter.op, mExecutions[i])); + } } auto curBackend = mExecutions[i]->backend(); // Alloc for Tensors @@ -294,21 +445,16 @@ ErrorCode Pipeline::allocMemory(bool supportDebug) { } } } - { auto code = allocFunction(iter.outputs); if (NO_ERROR != code) { return code; } } - // Wrap If needed - if (wrap && (!cached)) { + if (wrap) { mExecutions[i].reset(new WrapExecution(mBackupBackend.get(), mExecutions[i])); } - if ((!cached) && iter.buffer.empty() && (iter.op->type() != OpType_Raster)) { - mOriginExecution.insert(std::make_pair(iter.op, mExecutions[i])); - } auto code = mExecutions[i]->onResize(iter.inputs, iter.outputs); if (NO_ERROR != code) { return code; @@ -330,14 +476,6 @@ ErrorCode Pipeline::allocMemory(bool supportDebug) { } } mBackend->onResizeEnd(); - - /** Prepare DebugInfo*/ - if (supportDebug) { - mDebugInfos.resize(mBuffer.command.size()); - for (int i = 0; i < mBuffer.command.size(); ++i) { - mDebugInfos[i].setUp(mBuffer.command[i], i); - } - } return NO_ERROR; } diff --git a/source/core/Pipeline.hpp b/source/core/Pipeline.hpp index aa95475d..adeafba5 100644 --- a/source/core/Pipeline.hpp +++ b/source/core/Pipeline.hpp @@ -43,9 +43,9 @@ public: 3. 
copy op, inputs and outputs tensor info to mBuffer static_model: 3; dynamic_model: 1,2,3 */ - ErrorCode encode(bool isStatic = false); + ErrorCode encode(bool isStatic = false, bool supportDebug = false); /** allocMemory: create Execution and alloc memory for every op */ - ErrorCode allocMemory(bool supportDebug = true); + ErrorCode allocMemory(); /** execute this pipline */ ErrorCode execute(); ErrorCode executeCallBack(const TensorCallBackWithInfo& before, const TensorCallBackWithInfo& after); diff --git a/source/core/Schedule.cpp b/source/core/Schedule.cpp index 46ec75e0..b7019b1e 100644 --- a/source/core/Schedule.cpp +++ b/source/core/Schedule.cpp @@ -10,8 +10,8 @@ #include #include #include +#include #include -#include "core/DirectedAcyclicGraph.hpp" #include "core/Macro.h" #include "core/RuntimeFactory.hpp" #include "core/TensorUtils.hpp" @@ -19,26 +19,10 @@ #include "utils/InitNet.hpp" //#define MNN_OPEN_TIME_TRACE #include +using namespace std; //#define MNN_AUTO_CHECK_COST namespace MNN { -class OpNodeDef : public NodeDef { -public: - OpNodeDef(Op* op) { - this->op = op; - } - -public: - virtual shared_ptr> makeNode() override { - shared_ptr> ptr = make_shared>(); - ptr->setData(this->op); - return ptr; - } - -private: - Op* op; -}; - MNNForwardType Schedule::getApprociateType(const ScheduleConfig& config) { MNNForwardType type = config.type; // FIXME: Support Auto determine @@ -63,6 +47,7 @@ static bool _setUpTensorInfo(std::vector>& allTensors, c bool valid = true; auto& tensors = allTensors; tensors.resize(net->tensorName()->size()); + if (net->usage() == Usage_INFERENCE_STATIC) { // static model will set all tensors' shape auto describes = net->extraTensorDescribe(); @@ -121,80 +106,6 @@ static bool _setUpTensorInfo(std::vector>& allTensors, c return valid; } -static int _findOpPosition(const std::string& opName, const Net* net) { - for (int i = 0; i < net->oplists()->size(); ++i) { - auto op = net->oplists()->GetAs(i); - if (opName == op->name()->str()) { - return i; - } - } - return -1; -} - -static bool _validateOp(const Op* op) { - if (nullptr == op->inputIndexes() && nullptr == op->outputIndexes()) { - return false; - } - if (nullptr == op->name()) { - return false; - } - return true; -} - -static vector generateOneSchedulePath(const Net* net, const int begin, const int end, - const vector>& allTensors) { - vector oplists; - for (int i = begin; i < end; ++i) { - auto op = net->oplists()->GetAs(i); - if (op->type() == OpType_Input || !_validateOp(op)) { - continue; - } - oplists.emplace_back(const_cast(op)); - } - return oplists; -} - -static vector> generateSchedulePath(const Net* net, const ScheduleConfig& configs, - const vector>& allTensors) { - vector> oplists; - vector inputs(configs.path.inputs); - vector outputs(configs.path.outputs); - auto maxSize = std::max(inputs.size(), outputs.size()); - inputs.resize(maxSize); - outputs.resize(maxSize); - - for (int i = 0; i < inputs.size(); i++) { - string in = inputs[i]; - string out = outputs[i]; - int start = 0; - int end = net->oplists()->size(); - if (in.length() > 0) { - auto pos = _findOpPosition(in, net); - if (-1 == pos) { - MNN_PRINT("Can't find %s op as start op\n", in.c_str()); - } else { - start = pos; - } - } - if (out.length() > 0) { - auto pos = _findOpPosition(out, net); - if (-1 == pos) { - MNN_PRINT("Can't find %s op as end op\n", out.c_str()); - } else { - end = pos + 1; - } - } - if (start > end) { - MNN_PRINT("op order incorrect end op '%s' before begin op '%s',please check!\n", out.c_str(), 
in.c_str()); - } else { - vector path = generateOneSchedulePath(net, start, end, allTensors); - oplists.emplace_back(path); - } - } - - return oplists; -} - static void generateScheduleGraph(vector& ops, const Net* net, const ScheduleConfig& configs, const vector>& allTensors) { if (configs.path.inputs.empty() && configs.path.outputs.empty()) { @@ -209,43 +120,105 @@ static void generateScheduleGraph(vector& ops, const Net* net, const } return; } - vector> paths = generateSchedulePath(net, configs, allTensors); + // 0: not set, 1: output, 2:input + std::vector tensorMask(net->tensorName()->size()); + ::memset(tensorMask.data(), 0, tensorMask.size() * sizeof(int)); - unique_ptr> graph(new DirectedAcyclicGraph()); - - // add Node - unordered_map>> opMaps; - for (vector path : paths) { - for (Op* op : path) { - if (opMaps.find(op) == opMaps.end()) { - OpNodeDef def(op); - shared_ptr> n = graph->AddNode(def); - opMaps.insert(make_pair(op, n)); - } - } + // 0: use, 1: no use + std::vector opMask(net->oplists()->size()); + ::memset(opMask.data(), 0, opMask.size() * sizeof(int)); + + // Set Initial Status + std::set inputNames; + std::set outputNames; + for (auto& n : configs.path.inputs) { + inputNames.insert(n); } - - // add edges - for (vector path : paths) { - shared_ptr> pre = nullptr; - for (Op* op : path) { - shared_ptr> n = opMaps[op]; - if (nullptr == pre) { - pre = n; - } else { - graph->AddEdge(pre, n); - pre = n; - } - } + for (auto& n : configs.path.outputs) { + outputNames.insert(n); } - ops.clear(); - vector>> order; - if (graph->GetPostOrder(order)) { - for (shared_ptr> n : order) { - ops.emplace_back(n->getData()); + if (configs.mode == ScheduleConfig::Path::Mode::Tensor) { + for (int i=0; itensorName()->GetAsString(i)->c_str(); + if (outputNames.find(name) != outputNames.end()) { + tensorMask[i] = 1; + } + // If both input/output, set as input + if (inputNames.find(name) != inputNames.end()) { + tensorMask[i] = 2; + } } } else { - MNN_PRINT("op graph have cycle,schedule failed\n"); + // Op Mode + for (int i=0; ioplists()->GetAs(i); + if (nullptr == op->name()) { + continue; + } + auto name = op->name()->c_str(); + if (outputNames.find(name) != outputNames.end()) { + opMask[i] = 1; + if (nullptr != op->outputIndexes()) { + for (int j=0; joutputIndexes()->size(); ++j) { + auto index = op->outputIndexes()->data()[j]; + if (tensorMask[index] != 2) { + tensorMask[index] = 1; + } + } + } + if (nullptr != op->inputIndexes()) { + for (int j=0; jinputIndexes()->size(); ++j) { + auto index = op->inputIndexes()->data()[j]; + if (tensorMask[index] != 2) { + tensorMask[index] = 1; + } + } + } + } + if (inputNames.find(name) != inputNames.end()) { + opMask[i] = 1; + if (nullptr != op->outputIndexes()) { + for (int j=0; joutputIndexes()->size(); ++j) { + auto index = op->outputIndexes()->data()[j]; + tensorMask[index] = 2; + } + } + } + } + } + + bool change = false; + do { + change = false; + for (int i=0; i 0) { + continue; + } + auto op = net->oplists()->GetAs(i); + if (nullptr != op->outputIndexes()) { + for (int j=0; joutputIndexes()->size(); ++j) { + auto index = op->outputIndexes()->data()[j]; + if (tensorMask[index] == 1) { + opMask[i] = 1; + change = true; + } + } + } + if (nullptr != op->inputIndexes() && opMask[i]) { + for (int j=0; jinputIndexes()->size(); ++j) { + auto index = op->inputIndexes()->data()[j]; + if (tensorMask[index] != 2) { + tensorMask[index] = 1; + } + } + } + } + } while (change); + + for (int i=0; i 0) { + ops.emplace_back(net->oplists()->GetAs(i)); + } } } diff 
--git a/source/core/Schedule.hpp b/source/core/Schedule.hpp index 26afbbdc..0a2b9782 100644 --- a/source/core/Schedule.hpp +++ b/source/core/Schedule.hpp @@ -14,7 +14,9 @@ #include #include #include +#include #include "core/Backend.hpp" +#include "core/TensorUtils.hpp" namespace MNN { diff --git a/source/core/Session.cpp b/source/core/Session.cpp index 7a3ff124..449c2e7b 100644 --- a/source/core/Session.cpp +++ b/source/core/Session.cpp @@ -34,12 +34,13 @@ Session::Session(Schedule::ScheduleInfo&& info, Interpreter::SessionMode callBac for (auto& iter : info.pipelineInfo) { auto rt = mRuntime.first.find(iter.first.type)->second.get(); auto cpuRuntime = mRuntime.second; - std::shared_ptr first(rt->onCreate()); + std::shared_ptr first(rt->onCreate(iter.first.user)); std::shared_ptr second; if (first->type() == MNN_FORWARD_CPU) { second = first; } else { - second.reset(cpuRuntime->onCreate()); + BackendConfig defaultConfig; + second.reset(cpuRuntime->onCreate(&defaultConfig)); } std::shared_ptr newPipeline(new Pipeline(std::move(iter.second), first, second, inputMode == Interpreter::Session_Input_Inside, rt->onGetCompilerType() == Runtime::Compiler_Geometry)); mPipelines.emplace_back(std::move(newPipeline)); @@ -125,28 +126,36 @@ void Session::_clearCache() { } ErrorCode Session::resize(bool isStatic) { - for (auto& iter : mRuntime.first) { - iter.second->onGabageCollect(100); - } - if (!isStatic) { - _clearCache(); - } - bool debug = mCallBackMode == Interpreter::Session_Debug; - // Turn Pipeline to Command Buffer and Malloc resource - // TODO: Seperate Schedule and Malloc - for (auto& iter : mPipelines) { - auto error = iter->encode(isStatic); - if (NO_ERROR != error) { - return error; + if (mNeedResize) { + if (!isStatic) { + _clearCache(); } - error = iter->allocMemory(debug); - if (NO_ERROR != error) { - return error; + bool debug = mCallBackMode == Interpreter::Session_Debug; + for (auto& iter : mPipelines) { + auto error = iter->encode(isStatic, debug); + if (NO_ERROR != error) { + return error; + } } + mNeedResize = false; + mNeedMalloc = true; } - mNeedResize = false; - for (auto& iter : mRuntime.first) { - iter.second->onGabageCollect(0); + if (mNeedMalloc) { + // Set needResize = true for easy for judge in runSession when error + mNeedResize = true; + // Turn Pipeline to Command Buffer and Malloc resource + // TODO: Seperate Schedule and Malloc + for (auto& iter : mPipelines) { + auto error = iter->allocMemory(); + if (NO_ERROR != error) { + return error; + } + } + for (auto& iter : mRuntime.first) { + iter.second->onGabageCollect(0); + } + mNeedMalloc = false; + mNeedResize = false; } return NO_ERROR; } @@ -156,7 +165,9 @@ bool Session::getInfo(Interpreter::SessionInfoCode code, void* ptr) const { auto dst = (float*)ptr; float summer = mRuntime.second->onGetMemoryInMB(); for (auto& r : mRuntime.first) { - summer += r.second->onGetMemoryInMB(); + if (r.second.get() != mRuntime.second.get()) { + summer += r.second->onGetMemoryInMB(); + } } *dst = summer; return true; diff --git a/source/core/Session.hpp b/source/core/Session.hpp index 6e86a4f1..d8a3f5df 100644 --- a/source/core/Session.hpp +++ b/source/core/Session.hpp @@ -54,13 +54,7 @@ public: * @return result code. */ ErrorCode resize(bool isStatic = false); - /** - * @brief check if needs resize. - * @return needs resize or not. - */ - bool getNeedResize() const { - return mNeedResize; - } + /** * @brief set if needs resize. * @param flag needs resize or not. 
@@ -69,6 +63,10 @@ public: mNeedResize = flag; } + void setNeedMalloc(bool flag = true) { + mNeedMalloc = flag; + } + public: /** * @brief get backend that create the tensor. @@ -132,6 +130,7 @@ private: std::map mOutputs; bool mNeedResize = true; bool mValid = true; + bool mNeedMalloc = true; Interpreter::SessionMode mCallBackMode; }; } // namespace MNN diff --git a/source/core/Tensor.cpp b/source/core/Tensor.cpp index f2815ad3..343af1c4 100644 --- a/source/core/Tensor.cpp +++ b/source/core/Tensor.cpp @@ -52,7 +52,10 @@ Tensor::Tensor(const Tensor* tensor, DimensionType type, bool allocMemory) { mBuffer.device = 0; mBuffer.host = nullptr; mBuffer.dim = &mDescribe->dims[0]; - + auto& quantAttr = TensorUtils::getDescribe(tensor)->quantAttr; + if (quantAttr && buffer.type == TensorUtils::DataTypeToHalideType(quantAttr->type)) { + mBuffer.type = halide_type_of(); + } for (int i = 0; i < buffer.dimensions; ++i) { mBuffer.dim[i].extent = buffer.dim[i].extent; } @@ -97,6 +100,10 @@ Tensor::Tensor(const Tensor* tensor, DimensionType type, bool allocMemory) { } TensorUtils::setLinearLayout(this); + for (int i = mBuffer.dimensions; i < 4; i++) { + mBuffer.dim[i].extent = 1; + } + if (allocMemory) { auto memorySize = size(); if (memorySize > 0) { diff --git a/source/core/TensorUtils.cpp b/source/core/TensorUtils.cpp index 7afd9d40..8b811dce 100644 --- a/source/core/TensorUtils.cpp +++ b/source/core/TensorUtils.cpp @@ -373,14 +373,34 @@ static inline bool expandSrc(std::vector& src, std::vector& dst, std:: } return false; } +// expand stride and size with expand value +static inline bool expandStrideSize(int* src, int* dst, int* size, int& num, int expandValue) { +#define MNN_3_INT_INSERT(x, i, y) if (i == 2) { x[2] = y; } else if (i == 1) { x[2] = x[1]; x[1] = y; } else if (i == 0) { x[2] = x[1]; x[1] = x[0]; x[0] = y; } else { return false; } + for (int i = num-1; i >= 0; i--) { + int splitSize = expandValue / src[i]; + if (!(expandValue % src[i] || size[i] % splitSize)) { + MNN_3_INT_INSERT(src, i, expandValue) + MNN_3_INT_INSERT(dst, i, (splitSize * dst[i])) + size[i] /= splitSize; + MNN_3_INT_INSERT(size, (i+1), splitSize) + if (++num > 3) return false; + return true; + } + } + return false; +#undef MNN_3_INT_INSERT +} // fuse srcRegion and dstRegion to dstRegion if return true bool TensorUtils::fuseRegion(Tensor::InsideDescribe::Region& srcReg, Tensor::InsideDescribe::Region& dstReg) { if (srcReg.offset != nullptr || dstReg.offset != nullptr) { return false; } + // src data isnot full data of dst - if (srcReg.dst.offset > dstReg.src.offset) { + if (srcReg.dst.offset > dstReg.src.offset || + srcReg.dst.stride[1] > srcReg.size[2] || + srcReg.dst.stride[2] > srcReg.size[1] * srcReg.size[2]) { return false; } int dstTotalSize = 1, srcTotalSize = 1; @@ -430,6 +450,76 @@ bool TensorUtils::fuseRegion(Tensor::InsideDescribe::Region& srcReg, Tensor::Ins dstReg.size[2] = srcReg.size[2]; return true; } +#define MNN_FAST_FUSE_WITHOUT_STL +#ifdef MNN_FAST_FUSE_WITHOUT_STL + // general fuse + int srcDst[3], srcSrc[3], dstSrc[3], dstDst[3], srcSize[3], dstSize[3], newSrc[3], dstStride[3], srcStride[3]; +#define MNN_3_INT_INIT(x, y) { x[0] = y; x[1] = y; x[2] = y; } + MNN_3_INT_INIT(dstStride, -1) + MNN_3_INT_INIT(srcStride, -1) +#undef MNN_3_INT_INIT + int srcNum = 0, dstNum = 0, sizeNum = 0; + for (int i = 0; i < 3; i++) { + if (srcReg.size[i] > 1) { + srcStride[srcNum] = srcReg.dst.stride[i]; + srcDst[srcNum] = srcReg.dst.stride[i]; + srcSrc[srcNum] = srcReg.src.stride[i]; + srcSize[srcNum] = 
srcReg.size[i]; + srcNum++; + } + if (dstReg.size[i] > 1) { + dstStride[dstNum] = dstReg.src.stride[i]; + dstDst[dstNum] = dstReg.dst.stride[i]; + dstSrc[dstNum] = dstReg.src.stride[i]; + dstSize[dstNum] = dstReg.size[i]; + dstNum++; + } + } + sizeNum = dstNum; +#define MNN_3_INT_DIFF(r, x, y, i) if ((x[i] != y[0]) && (x[i] != y[1]) && (x[i] != y[2])) { if (r > 0) { return false; } else { r = x[i]; } } + int srcExtra = -1, dstExtra = -1; + MNN_3_INT_DIFF(srcExtra, srcStride, dstStride, 0) + MNN_3_INT_DIFF(srcExtra, srcStride, dstStride, 1) + MNN_3_INT_DIFF(srcExtra, srcStride, dstStride, 2) + MNN_3_INT_DIFF(dstExtra, dstStride, srcStride, 0) + MNN_3_INT_DIFF(dstExtra, dstStride, srcStride, 1) + MNN_3_INT_DIFF(dstExtra, dstStride, srcStride, 2) +#undef MNN_3_INT_DIFF + if (dstExtra > 0) { + if (!expandStrideSize(srcDst, srcSrc, srcSize, srcNum, dstExtra)) { + return false; + } + } + if (srcExtra > 0) { + if (!expandStrideSize(dstSrc, dstDst, dstSize, dstNum, srcExtra)) { + return false; + } + } + // reorder srcSrc to newSrc by align srcDst and dstSrc + for (int i = 0; i < dstNum; i++) { + int index = 0; + for (int j = 0; j < srcNum; j++) { + if (dstSrc[j] == srcDst[i]) { + index = j; + } + } + newSrc[index] = srcSrc[i]; + } + // set final size and set expandIdx if expand val is 1 + int expandIdx = -1; + if (dstNum > sizeNum) { + for (int i = 2; i >= 0; i--) { + if (i < dstNum) { + if (dstSize[i] == 1) { + expandIdx = i; + } + dstReg.size[i] = dstSize[i]; + } else { + dstReg.size[i] = 1; + } + } + } +#else // general fuse std::set dstStride, srcStride, dstDiff, srcDiff; std::vector dstDst, dstSrc, srcDst, srcSrc, newSrc, dstSize, srcSize; @@ -489,6 +579,7 @@ bool TensorUtils::fuseRegion(Tensor::InsideDescribe::Region& srcReg, Tensor::Ins } } } +#endif int idx = 0; for (int i = 0; i < 3; i++) { if (dstReg.size[i] > 1 || i == expandIdx) { @@ -523,4 +614,71 @@ Tensor::DimensionType TensorUtils::getDimType(const Tensor* t) { return Tensor::TENSORFLOW; } +halide_type_t TensorUtils::DataTypeToHalideType(DataType t) { + switch (t) { + case DataType_DT_DOUBLE: + case DataType_DT_FLOAT: + return halide_type_of(); + case DataType_DT_BFLOAT16: + return halide_type_t(halide_type_float, 16); + case DataType_DT_QINT32: + case DataType_DT_INT32: + case DataType_DT_BOOL: + case DataType_DT_INT64: + return halide_type_of(); + case DataType_DT_QINT8: + case DataType_DT_INT8: + return halide_type_of(); + case DataType_DT_QUINT8: + case DataType_DT_UINT8: + return halide_type_of(); + case DataType_DT_QUINT16: + case DataType_DT_UINT16: + return halide_type_of(); + case DataType_DT_QINT16: + case DataType_DT_INT16: + return halide_type_of(); + case DataType_DT_STRING: + default: + MNN_PRINT("Unsupported data type!"); + MNN_ASSERT(false); + return halide_type_of(); + } +} + +DataType TensorUtils::HaildeTypeToDataType(halide_type_t t) { + if (t == halide_type_of()) { + return DataType_DT_INT8; + } + if (t == halide_type_of()) { + return DataType_DT_INT16; + } + if (t == halide_type_of()) { + return DataType_DT_INT32; + } + if (t == halide_type_of()) { + return DataType_DT_INT64; + } + if (t == halide_type_of()) { + return DataType_DT_UINT8; + } + if (t == halide_type_of()) { + return DataType_DT_UINT16; + } + if (t == halide_type_t(halide_type_float, 16)) { + return DataType_DT_BFLOAT16; + } + if (t == halide_type_of()) { + return DataType_DT_FLOAT; + } + if (t == halide_type_of()) { + return DataType_DT_DOUBLE; + } + MNN_PRINT("Unsupported data type!"); + MNN_ASSERT(false); + return DataType_DT_INVALID; +} +float 
TensorUtils::getScale(const Tensor* t) { + return getDescribe(t)->quantAttr ? getDescribe(t)->quantAttr->scale : 0.f; +} } // namespace MNN diff --git a/source/core/TensorUtils.hpp b/source/core/TensorUtils.hpp index eb44dff8..d94eab12 100644 --- a/source/core/TensorUtils.hpp +++ b/source/core/TensorUtils.hpp @@ -28,6 +28,13 @@ struct TensorArrayAttr { // the shape of element std::vector> elemShape; }; +struct QuantAttr { + float scale; + float zero = 0.0f; + float min = -128.0f; + float max = 127.0f; + DataType type = DataType_DT_INT8; +}; /** extra tensor info container */ struct Tensor::InsideDescribe { public: @@ -86,6 +93,8 @@ public: halide_dimension_t dims[MNN_MAX_TENSOR_DIM]; // TensorArray Attribute std::shared_ptr tensorArrayAttr; + // Tensor Quant Attribute + std::shared_ptr quantAttr; }; typedef Tensor::InsideDescribe::Usage TensorUsage; @@ -142,6 +151,9 @@ public: static bool fuseRegion(Tensor::InsideDescribe::Region& srcReg, Tensor::InsideDescribe::Region& dstReg); static void adjustTensorForCompability(Tensor* t); static Tensor::DimensionType getDimType(const Tensor* t); + static halide_type_t DataTypeToHalideType(DataType t); + static DataType HaildeTypeToDataType(halide_type_t t); + static float getScale(const Tensor* t); }; } // namespace MNN diff --git a/source/core/WrapExecution.cpp b/source/core/WrapExecution.cpp index 39ce6754..3adab598 100644 --- a/source/core/WrapExecution.cpp +++ b/source/core/WrapExecution.cpp @@ -37,6 +37,7 @@ Tensor* WrapExecution::_getCopyTensor(Tensor* inputTensor) { if (srcBackend->type() == mCPUBackend->type()) { std::shared_ptr wrapTensor(new Tensor); TensorUtils::copyShape(inputTensor, wrapTensor.get(), true); + TensorUtils::adjustTensorForCompability(wrapTensor.get()); wrapTensor->buffer().type = inputTensor->buffer().type; mInputMaps.insert(std::make_pair(inputTensor, std::make_tuple(dstBackend, dstBackend, wrapTensor))); return wrapTensor.get(); @@ -46,6 +47,7 @@ Tensor* WrapExecution::_getCopyTensor(Tensor* inputTensor) { std::shared_ptr wrapTensor(new Tensor); TensorUtils::copyShape(inputTensor, wrapTensor.get(), true); wrapTensor->buffer().type = inputTensor->buffer().type; + TensorUtils::adjustTensorForCompability(wrapTensor.get()); mInputMaps.insert(std::make_pair(inputTensor, std::make_tuple(mCPUBackend, srcBackend, wrapTensor))); return wrapTensor.get(); } @@ -54,6 +56,8 @@ Tensor* WrapExecution::_getCopyTensor(Tensor* inputTensor) { std::shared_ptr wrapTensor(new Tensor); TensorUtils::copyShape(inputTensor, midTensor.get(), true); TensorUtils::copyShape(inputTensor, wrapTensor.get(), true); + TensorUtils::adjustTensorForCompability(wrapTensor.get()); + TensorUtils::adjustTensorForCompability(midTensor.get()); TensorUtils::getDescribe(midTensor.get())->usage = TensorUtils::getDescribe(inputTensor)->usage; midTensor->buffer().type = inputTensor->buffer().type; wrapTensor->buffer().type = inputTensor->buffer().type; diff --git a/source/cv/ImageProcess.cpp b/source/cv/ImageProcess.cpp index d1560074..72664443 100644 --- a/source/cv/ImageProcess.cpp +++ b/source/cv/ImageProcess.cpp @@ -19,6 +19,11 @@ #include "backend/cpu/CPUTensorConvert.hpp" #include #include "core/Backend.hpp" + +#ifdef _MSC_VER +#include "backend/cpu/x86_x64/cpu_id.h" +#endif + #define CACHE_SIZE 256 namespace MNN { namespace CV { @@ -45,7 +50,16 @@ ImageProcess::ImageProcess(const Config& config) { ImageProcess* ImageProcess::create(const Config& config, const Tensor* dstTensor) { // TODO Get dstTensor' backend - + #ifdef _MSC_VER + auto cpuFlags = 
libyuv::InitCpuFlags(); + bool support = true; + support = support && (cpuFlags & libyuv::kCpuHasSSSE3); // _mm_shuffle_epi8 + support = support && (cpuFlags & libyuv::kCpuHasSSE41); // _mm_cvtepu8_epi32 + if (!support) { + MNN_ERROR("CPU must support SSSE3 and SSE4.1 for using ImageProcess\n"); + return nullptr; + } + #endif return new ImageProcess(config); } @@ -318,14 +332,14 @@ ErrorCode ImageProcess::convert(const uint8_t* source, int iw, int ih, int strid if (sta != 0 || end < count) { if (sourceBpp > 0) { if (sta > 0) { - ::memset(samplerDest, 0, sourceBpp * sta); + ::memset(samplerDest, mPaddingValue, sourceBpp * sta); } if (end < count) { - ::memset(samplerDest + end * sourceBpp, 0, (count - end) * sourceBpp); + ::memset(samplerDest + end * sourceBpp, mPaddingValue, (count - end) * sourceBpp); } } else { // TODO, Only support NV12 / NV21 - ::memset(samplerDest, 0, count); + ::memset(samplerDest, mPaddingValue, count); ::memset(samplerDest + count, 128, UP_DIV(count, 2) * 2); } } diff --git a/source/geometry/ConvertUtils.cpp b/source/geometry/ConvertUtils.cpp index 14538790..885f0200 100644 --- a/source/geometry/ConvertUtils.cpp +++ b/source/geometry/ConvertUtils.cpp @@ -104,18 +104,20 @@ void ConvertUtils::broadcastto(Tensor* input, Tensor* output) { inputShape[i + offset] = input->length(i); } // Compute Strides - std::vector sepInputShape; - std::vector sepOutputShape; + int sepInputShapeSize = 0; + int sepOutputShapeSize = 0; + int sepInputShape[MNN_MAX_TENSOR_DIM]; + int sepOutputShape[MNN_MAX_TENSOR_DIM]; int currentInput = 1; int currentOutput = 1; for (int i = 0; i < outputDim; ++i) { if (inputShape[i] != output->length(i)) { if (1 < currentOutput) { - sepInputShape.emplace_back(currentInput); - sepOutputShape.emplace_back(currentOutput); + sepInputShape[sepInputShapeSize++] = currentInput; + sepOutputShape[sepOutputShapeSize++] = currentOutput; } - sepInputShape.emplace_back(inputShape[i]); - sepOutputShape.emplace_back(output->length(i)); + sepInputShape[sepInputShapeSize++] = (inputShape[i]); + sepOutputShape[sepOutputShapeSize++] = (output->length(i)); currentInput = 1; currentOutput = 1; } else { @@ -124,23 +126,23 @@ void ConvertUtils::broadcastto(Tensor* input, Tensor* output) { } } if (currentOutput != 1 || currentInput != 1) { - sepInputShape.emplace_back(currentInput); - sepOutputShape.emplace_back(currentOutput); + sepInputShape[sepInputShapeSize++] = (currentInput); + sepOutputShape[sepOutputShapeSize++] = (currentOutput); } int seperateOutputStrides[MNN_MAX_TENSOR_DIM]; int seperateInputStrides[MNN_MAX_TENSOR_DIM]; - OpCommonUtils::computeStride(seperateOutputStrides, sepOutputShape.data(), sepOutputShape.size()); - OpCommonUtils::computeStride(seperateInputStrides, sepInputShape.data(), sepInputShape.size()); - for (int i = 0; i < sepInputShape.size(); ++i) { + OpCommonUtils::computeStride(seperateOutputStrides, sepOutputShape, sepOutputShapeSize); + OpCommonUtils::computeStride(seperateInputStrides, sepInputShape, sepInputShapeSize); + for (int i = 0; i < sepInputShapeSize; ++i) { if (1 == sepInputShape[i]) { seperateInputStrides[i] = 0; } } // Split region by size, use stride to determine src and dst mapping - int remainDimSize = sepInputShape.size() > 3 ? (int)sepInputShape.size() - 3 : 0; + int remainDimSize = sepInputShapeSize > 3 ? 
(int)sepInputShapeSize - 3 : 0; std::vector remainStride(remainDimSize + 1); - int remainSize = OpCommonUtils::computeStride(remainStride.data(), sepOutputShape.data(), remainDimSize); + int remainSize = OpCommonUtils::computeStride(remainStride.data(), sepOutputShape, remainDimSize); outputDes->regions.resize(remainSize); std::vector cords(remainDimSize + 1); for (int index = 0; index < remainSize; ++index) { @@ -152,7 +154,7 @@ void ConvertUtils::broadcastto(Tensor* input, Tensor* output) { } reg.origin = input; for (int i = 0; i < 3; ++i) { - auto match = (int)sepOutputShape.size() - i - 1; + auto match = (int)sepOutputShapeSize - i - 1; if (match < 0) { continue; } diff --git a/source/geometry/GeometryBatchMatMul.cpp b/source/geometry/GeometryBatchMatMul.cpp index 66604839..362d0ddb 100644 --- a/source/geometry/GeometryBatchMatMul.cpp +++ b/source/geometry/GeometryBatchMatMul.cpp @@ -11,6 +11,7 @@ #include "geometry/GeometryComputerUtils.hpp" namespace MNN { +#ifdef MNN_SUPPORT_GEOMETRY_LOOP class GeometryBatchMatMul : public GeometryComputer { public: @@ -203,10 +204,12 @@ public: return true; } }; - +#endif static void _create() { +#ifdef MNN_SUPPORT_GEOMETRY_LOOP std::shared_ptr comp(new GeometryBatchMatMul); GeometryComputer::registerGeometryComputer(comp, {OpType_MatMul}); +#endif } REGISTER_GEOMETRY(GeometryBatchMatMul, _create); diff --git a/source/geometry/GeometryBinary.cpp b/source/geometry/GeometryBinary.cpp index db39c8ee..a4af55c8 100644 --- a/source/geometry/GeometryBinary.cpp +++ b/source/geometry/GeometryBinary.cpp @@ -20,6 +20,9 @@ public: auto inputL0 = input0->elementSize(); auto inputL1 = input1->elementSize(); auto outputSize = output->elementSize(); + auto inp0format = TensorUtils::getDescribe(inputs[0])->dimensionFormat; + auto inp1format = TensorUtils::getDescribe(inputs[1])->dimensionFormat; + auto outFormat = TensorUtils::getDescribe(output)->dimensionFormat; MNN_ASSERT(0 != inputL1 && 0 != inputL0 && 0 != outputSize); if (1 == inputL0 || 1 == inputL1) { // Can directly compute @@ -31,7 +34,7 @@ public: return true; } // Need Broadcast or same shape - if (outputSize != inputL0) { + if (outputSize != inputL0 || inp0format != outFormat) { std::shared_ptr newTensor(new Tensor); TensorUtils::copyShape(output, newTensor.get(), true); newTensor->buffer().type = output->buffer().type; @@ -39,7 +42,7 @@ public: input0 = newTensor.get(); res.extras.emplace_back(newTensor); } - if (outputSize != inputL1) { + if (outputSize != inputL1 || inp1format != outFormat) { std::shared_ptr newTensor(new Tensor); TensorUtils::copyShape(output, newTensor.get(), true); newTensor->buffer().type = output->buffer().type; diff --git a/source/geometry/GeometryComputer.cpp b/source/geometry/GeometryComputer.cpp index 8540a2c7..698d7ce6 100644 --- a/source/geometry/GeometryComputer.cpp +++ b/source/geometry/GeometryComputer.cpp @@ -36,7 +36,6 @@ GeometryComputer::Context::Context(std::shared_ptr allocBackend, bool p } void GeometryComputer::Context::clear() { - mRasterCache.clear(); pOutputs.clear(); } const std::vector>& GeometryComputer::Context::searchConst(const Op* op) const { @@ -64,10 +63,20 @@ std::shared_ptr GeometryComputer::Context::allocConst(const Op* key, con return tensor; } -Tensor* GeometryComputer::Context::getRasterCacheCreateRecurrse(Tensor* src, CommandBuffer& cmd) { +bool GeometryComputer::Context::allocTensor(Tensor* tensor) { + auto res = mBackend->onAcquireBuffer(tensor, Backend::STATIC); + if (!res) { + return false; + } + TensorUtils::getDescribe(tensor)->usage = 
Tensor::InsideDescribe::CONSTANT; + TensorUtils::getDescribe(tensor)->backend = mBackend.get(); + return true; +} + +void GeometryComputer::Context::getRasterCacheCreateRecurrse(Tensor* src, CommandBuffer& cmd) { auto srcDes = TensorUtils::getDescribe(src); if (srcDes->memoryType != Tensor::InsideDescribe::MEMORY_VIRTUAL) { - return src; + return; } for (auto& input : srcDes->regions) { MNN_ASSERT(input.origin != src); @@ -82,86 +91,35 @@ Tensor* GeometryComputer::Context::getRasterCacheCreateRecurrse(Tensor* src, Com } inputDes = TensorUtils::getDescribe(input.origin); } - input.origin = getRasterCacheCreateRecurrse(input.origin, cmd); + getRasterCacheCreateRecurrse(input.origin, cmd); if (input.offset != nullptr) { - input.offset = getRasterCacheCreateRecurrse(input.offset, cmd); + getRasterCacheCreateRecurrse(input.offset, cmd); } MNN_ASSERT(TensorUtils::getDescribe(input.origin)->memoryType != Tensor::InsideDescribe::MEMORY_VIRTUAL); } - return getRasterCacheCreate(src, cmd); + getRasterCacheCreate(src, cmd); } -std::shared_ptr GeometryComputer::Context::getCachedTensor(Tensor* t) { - auto findIter = mRasterCache.find(t); - if (findIter != mRasterCache.end()) { - return findIter->second; - } - auto tDes = TensorUtils::getDescribe(t); - for (auto& iter : mRasterCache) { - Tensor* s = iter.first; - bool shapeEqual = s->dimensions() == t->dimensions(); - shapeEqual &= s->getType() == t->getType(); - shapeEqual &= TensorUtils::getDescribe(s)->dimensionFormat == TensorUtils::getDescribe(t)->dimensionFormat; - for (int i = 0; i < t->dimensions() && shapeEqual; i++) { - shapeEqual &= s->length(i) == t->length(i); - } - if (!shapeEqual) { - continue; - } - auto sDes = TensorUtils::getDescribe(s); - if (tDes->regions.size() == sDes->regions.size()) { - bool equal = true; - for (int i = 0; i < sDes->regions.size(); i++) { - auto sReg = sDes->regions[i]; - auto tReg = tDes->regions[i]; - equal &= !::memcmp(&sReg, &tReg, sizeof(sReg)); - } - if (equal) { - return iter.second; - } - } - } - return nullptr; -} -Tensor* GeometryComputer::Context::getRasterCacheCreate(Tensor* src, CommandBuffer& cmdBuffer) { +void GeometryComputer::Context::getRasterCacheCreate(Tensor* src, CommandBuffer& cmdBuffer) { auto srcDes = TensorUtils::getDescribe(src); if (srcDes->memoryType != Tensor::InsideDescribe::MEMORY_VIRTUAL) { - return src; - } - auto cached = getCachedTensor(src); - if (cached) { - return cached.get(); + return; } Command cmd; cmd.op = flatbuffers::GetRoot(mRasterOp.data()); - auto iter = pOutputs.find(src); - if (iter != pOutputs.end()) { - auto output = src; - auto oldDes = TensorUtils::getDescribe(output); - MNN_ASSERT(oldDes->memoryType == Tensor::InsideDescribe::MEMORY_VIRTUAL); - std::shared_ptr newTensor(new Tensor); - TensorUtils::copyShape(output, newTensor.get(), true); - newTensor->buffer().type = output->getType(); - auto newDes = TensorUtils::getDescribe(newTensor.get()); - newDes->regions = std::move(oldDes->regions); - newDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL; - oldDes->memoryType = Tensor::InsideDescribe::MEMORY_BACKEND; - cmd.inputs = {newTensor.get()}; - cmd.outputs = {src}; - cmdBuffer.command.emplace_back(std::move(cmd)); - cmdBuffer.extras.emplace_back(newTensor); - pOutputs.erase(iter); - return src; - } + auto output = src; + auto oldDes = TensorUtils::getDescribe(output); + MNN_ASSERT(oldDes->memoryType == Tensor::InsideDescribe::MEMORY_VIRTUAL); std::shared_ptr newTensor(new Tensor); - TensorUtils::copyShape(src, newTensor.get(), true); - 
newTensor->buffer().type = src->getType(); - TensorUtils::adjustTensorForCompability(newTensor.get()); - cmd.inputs = {src}; - cmd.outputs = {newTensor.get()}; + TensorUtils::copyShape(output, newTensor.get(), true); + newTensor->buffer().type = output->getType(); + auto newDes = TensorUtils::getDescribe(newTensor.get()); + newDes->regions = std::move(oldDes->regions); + newDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL; + oldDes->memoryType = Tensor::InsideDescribe::MEMORY_BACKEND; + cmd.inputs = {newTensor.get()}; + cmd.outputs = {src}; cmdBuffer.command.emplace_back(std::move(cmd)); cmdBuffer.extras.emplace_back(newTensor); - mRasterCache.insert(std::make_pair(src, newTensor)); - return newTensor.get(); } bool GeometryComputer::compute(const Op* op, const std::vector& inputs, const std::vector& outputs, GeometryComputer::Context& context, @@ -174,10 +132,10 @@ bool GeometryComputer::compute(const Op* op, const std::vector& inputs, continue; } if (!context.supportVirtual()) { - context.pOutputs.insert(outputs[i]); + context.pOutputs.emplace_back(outputs[i]); } else { if (oldDes->usage == Tensor::InsideDescribe::OUTPUT) { - context.pOutputs.insert(outputs[i]); + context.pOutputs.emplace_back(outputs[i]); } } } @@ -240,20 +198,4 @@ void GeometryComputer::init() { const GeometryComputer* GeometryComputer::search(int type) { return GeometryComputerManager::get()->search(type); } - -Command GeometryComputer::makeRaster(Tensor* input, Tensor* output) { - flatbuffers::FlatBufferBuilder builder; - OpBuilder opBuilder(builder); - opBuilder.add_type(OpType_Raster); - auto lastOffset = opBuilder.Finish(); - builder.Finish(lastOffset); - Command cmd; - cmd.buffer.resize(builder.GetSize()); - ::memcpy(cmd.buffer.data(), builder.GetBufferPointer(), cmd.buffer.size()); - cmd.inputs = {input}; - cmd.outputs = {output}; - cmd.op = flatbuffers::GetMutableRoot(cmd.buffer.data()); - return cmd; -} - } // namespace MNN diff --git a/source/geometry/GeometryComputer.hpp b/source/geometry/GeometryComputer.hpp index a5121f58..09d6d0e1 100644 --- a/source/geometry/GeometryComputer.hpp +++ b/source/geometry/GeometryComputer.hpp @@ -10,7 +10,6 @@ #define GeometryComputer_hpp #include #include -#include #include "MNN_generated.h" #include "core/Command.hpp" #include "core/TensorUtils.hpp" @@ -31,15 +30,14 @@ public: bool supportVirtual() const { return mPermitVirtual; } - Tensor* getRasterCacheCreateRecurrse(Tensor* src, CommandBuffer& cmd); + void getRasterCacheCreateRecurrse(Tensor* src, CommandBuffer& cmd); const std::vector>& searchConst(const Op* op) const; std::shared_ptr allocConst(const Op* key, const std::vector& shape, halide_type_t type, Tensor::DimensionType dimType = Tensor::TENSORFLOW); - std::set pOutputs; + bool allocTensor(Tensor* tenosr); + std::vector pOutputs; private: - Tensor* getRasterCacheCreate(Tensor* src, CommandBuffer& cmd); - std::shared_ptr getCachedTensor(Tensor* t); - std::map> mRasterCache; + void getRasterCacheCreate(Tensor* src, CommandBuffer& cmd); std::map>> mConstTensors; std::vector> mEmpty; bool mPermitVirtual; @@ -48,7 +46,6 @@ public: }; static void init(); MNN_PUBLIC static const GeometryComputer* search(int type); - static Command makeRaster(Tensor* input, Tensor* output); static void registerGeometryComputer(std::shared_ptr comp, std::vector type); MNN_PUBLIC bool compute(const Op* op, const std::vector& inputs, const std::vector& outputs, Context& context, CommandBuffer& cmd) const; diff --git a/source/geometry/GeometryComputerUtils.cpp 
b/source/geometry/GeometryComputerUtils.cpp index a62479ef..052d2f1d 100644 --- a/source/geometry/GeometryComputerUtils.cpp +++ b/source/geometry/GeometryComputerUtils.cpp @@ -11,6 +11,8 @@ #include "core/RuntimeFactory.hpp" #include "shape/SizeComputer.hpp" #include +#include "core/AutoStorage.h" + #ifdef MNN_BUILD_CODEGEN #include "OpFuse.hpp" #endif @@ -79,7 +81,7 @@ void GeometryComputerUtils::buildConstantTensors(std::vectorbackend = backupBackend.get(); - std::shared_ptr exe(backupBackend->onCreate(info.inputs, info.outputs, info.op)); + AutoRelease exe(backupBackend->onCreate(info.inputs, info.outputs, info.op)); exe->onResize(info.inputs, info.outputs); exe->onExecute(info.inputs, info.outputs); constTensors.emplace_back(info.outputs[0]); @@ -95,7 +97,7 @@ void GeometryComputerUtils::buildConstantTensors(std::vectorusage == Tensor::InsideDescribe::CONSTANT) { continue; } - if (SizeComputer::opNeedContent(info.op->type(), i)) { + if (OpCommonUtils::opNeedContent(info.op->type(), i)) { isConst = false; break; } @@ -113,7 +115,7 @@ void GeometryComputerUtils::buildConstantTensors(std::vectortype() == OpType_Const) { continue; } - auto dims = SizeComputer::needInputContent(info.op); + auto dims = SizeComputer::needInputContent(info.op, info.inputs.size()); for (auto index : dims) { if (index < info.inputs.size()) { if (TensorUtils::getDescribe(info.inputs[index])->usage != Tensor::InsideDescribe::CONSTANT) { @@ -214,8 +216,8 @@ ErrorCode GeometryComputerUtils::shapeComputeAndGeometryTransform( } GeometryComputerUtils::makeRaster(tempSrcbuffer, tempDstBuffer, ctx); for (auto& c : tempDstBuffer.command) { - std::shared_ptr exe(backupBackend->onCreate(c.inputs, c.outputs, c.op)); - if (nullptr == exe) { + AutoRelease exe(backupBackend->onCreate(c.inputs, c.outputs, c.op)); + if (nullptr == exe.get()) { MNN_ERROR("Const Folder Error for %s\n", info.op->name()->c_str()); return NO_EXECUTION; } @@ -269,6 +271,7 @@ ErrorCode GeometryComputerUtils::shapeComputeAndGeometryTransform( } } GeometryComputerUtils::makeRaster(tmpBuffer, buffer, geoContext); +#ifdef MNN_ADD_NAME std::unordered_map nameIdx; auto getName = [&nameIdx](const std::string& name) { auto iter = nameIdx.find(name); @@ -319,6 +322,7 @@ ErrorCode GeometryComputerUtils::shapeComputeAndGeometryTransform( } } } +#endif } else { for (auto& info : infos) { if (info.type == Schedule::CONSTANT) { @@ -356,20 +360,20 @@ void GeometryComputerUtils::makeRaster(const CommandBuffer& srcBuffer, CommandBu auto type = op->type(); MNN_ASSERT(OpType_Raster != type); for (int i = 0; i < iter.inputs.size(); ++i) { - if (!SizeComputer::opNeedContent(type, i)) { + if (!OpCommonUtils::opNeedContent(type, i)) { continue; } auto des = TensorUtils::getDescribe(cmd.inputs[i]); MNN_ASSERT(des->tensorArrayAttr == nullptr); if (des->memoryType == Tensor::InsideDescribe::MEMORY_VIRTUAL) { - cmd.inputs[i] = ctx.getRasterCacheCreateRecurrse(cmd.inputs[i], dstBuffer); + ctx.getRasterCacheCreateRecurrse(cmd.inputs[i], dstBuffer); } } dstBuffer.command.emplace_back(std::move(cmd)); } auto& outputs = ctx.pOutputs; - while (!ctx.pOutputs.empty()) { - ctx.getRasterCacheCreateRecurrse(*ctx.pOutputs.begin(), dstBuffer); + for (auto& o : ctx.pOutputs) { + ctx.getRasterCacheCreateRecurrse(o, dstBuffer); } } Command GeometryComputerUtils::makeBinary(int type, Tensor* input0, Tensor* input1, Tensor* output) { diff --git a/source/geometry/GeometryCrop.cpp b/source/geometry/GeometryCrop.cpp index dc095e4f..19269988 100644 --- a/source/geometry/GeometryCrop.cpp +++ 
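The GeometryComputerUtils.cpp hunk above replaces std::shared_ptr with AutoRelease for the throwaway Execution created during constant folding. A minimal RAII wrapper of the same shape, assuming only single ownership with no reference counting (hypothetical name, not the contents of core/AutoStorage.h):

template <typename T>
class ScopedDelete {  // hypothetical name; MNN's helper is AutoRelease
public:
    explicit ScopedDelete(T* p = nullptr) : mPtr(p) {}
    ~ScopedDelete() { delete mPtr; }
    T* get() const { return mPtr; }
    T* operator->() const { return mPtr; }
    ScopedDelete(const ScopedDelete&) = delete;             // single owner by design,
    ScopedDelete& operator=(const ScopedDelete&) = delete;  // no control-block overhead
private:
    T* mPtr;
};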
b/source/geometry/GeometryCrop.cpp @@ -190,7 +190,6 @@ public: } padRegion.emplace_back(r); } - MNN_ASSERT(padRegion.size() == seperateInputDims.size()); std::vector padRegionMod(padRegion.size()); int regionSize = OpCommonUtils::computeStride(padRegionMod.data(), padRegion.data(), padRegion.size()); int remainDimOffset = (int)remainStride.size(); diff --git a/source/geometry/GeometryOPRegister.cpp b/source/geometry/GeometryOPRegister.cpp index 646cea5b..52dde091 100644 --- a/source/geometry/GeometryOPRegister.cpp +++ b/source/geometry/GeometryOPRegister.cpp @@ -1,6 +1,7 @@ // This file is generated by Shell for ops register #include "geometry/GeometryComputer.hpp" namespace MNN { +extern void ___GeometryShape___create__(); extern void ___GeometryPermute___create__(); extern void ___GeometryTile___create__(); extern void ___GeometryReshape___create__(); @@ -29,14 +30,15 @@ extern void ___GeometryDilation2D___create__(); extern void ___GeometrySpaceToBatchND___create__(); extern void ___GeometryPooling3D___create__(); extern void ___GeometryELU___create__(); -extern void ___GeometryTanH___create__(); extern void ___GeometryThreshold___create__(); extern void ___GeometryLRN___create__(); extern void ___GeometrySlice___create__(); extern void ___GeometryConcat___create__(); +extern void ___GeometryUnary___create__(); extern void ___GeometryBinary___create__(); void registerGeometryOps() { +___GeometryShape___create__(); ___GeometryPermute___create__(); ___GeometryTile___create__(); ___GeometryReshape___create__(); @@ -65,11 +67,11 @@ ___GeometryDilation2D___create__(); ___GeometrySpaceToBatchND___create__(); ___GeometryPooling3D___create__(); ___GeometryELU___create__(); -___GeometryTanH___create__(); ___GeometryThreshold___create__(); ___GeometryLRN___create__(); ___GeometrySlice___create__(); ___GeometryConcat___create__(); +___GeometryUnary___create__(); ___GeometryBinary___create__(); } } diff --git a/source/geometry/GeometryReverseSequence.cpp b/source/geometry/GeometryReverseSequence.cpp index ff6c38f1..cf620f26 100644 --- a/source/geometry/GeometryReverseSequence.cpp +++ b/source/geometry/GeometryReverseSequence.cpp @@ -138,9 +138,50 @@ public: return true; } }; + +class GeometryReverse : public GeometryComputer { +public: + virtual bool onCompute(const Op* op, const std::vector& inputs, const std::vector& outputs, + Context& context, CommandBuffer& res) const override { + MNN_ASSERT(1 == outputs.size()); + MNN_ASSERT(2 == inputs.size()); + auto output = outputs[0]; + auto input = inputs[0]; + int axis = inputs[1]->host()[0]; + int outsideSize = 1, insideSize = 1, reverseSize = input->length(axis); + for (int i = 0; i < input->dimensions(); i++) { + if (i < axis) { + outsideSize *= input->length(i); + } + if (i > axis) { + insideSize *= input->length(i); + } + } + auto outputDes = TensorUtils::getDescribe(output); + outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL; + for (int i = 0; i < outsideSize; i++) { + Tensor::InsideDescribe::Region region; + region.origin = input; + + region.size[0] = reverseSize; + region.size[1] = insideSize; + region.size[2] = 1; + + region.src.offset = (i + 1) * reverseSize * insideSize - insideSize; + region.src.stride[0] = -insideSize; + + region.dst.offset = i * reverseSize * insideSize; + region.dst.stride[0] = insideSize; + outputDes->regions.emplace_back(std::move(region)); + } + return true; + } +}; static void _create() { std::shared_ptr comp(new GeometryReverseSequence); GeometryComputer::registerGeometryComputer(comp, 
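To make the negative-stride region built by GeometryReverse above concrete, here is a plain-C++ model of the index arithmetic a single region encodes, assuming one outside block: the source offset starts at the last row of the reversed axis, and a stride of -insideSize walks it backwards while the destination walks forwards.

#include <cstdio>
#include <vector>

int main() {
    // Example: reverse an axis of length 3 with insideSize 2.
    const int reverseSize = 3, insideSize = 2;
    std::vector<int> src = {0, 1, 2, 3, 4, 5};
    std::vector<int> dst(src.size());

    const int srcOffset = reverseSize * insideSize - insideSize; // last row first
    const int srcStride = -insideSize;                           // walk backwards
    for (int r = 0; r < reverseSize; ++r) {
        for (int i = 0; i < insideSize; ++i) {
            dst[r * insideSize + i] = src[srcOffset + r * srcStride + i];
        }
    }
    for (int v : dst) std::printf("%d ", v);  // prints: 4 5 2 3 0 1
    std::printf("\n");
    return 0;
}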
{OpType_ReverseSequence}); + std::shared_ptr comp1(new GeometryReverse); + GeometryComputer::registerGeometryComputer(comp1, {OpType_Reverse}); } REGISTER_GEOMETRY(GeometryReverseSequence, _create); diff --git a/source/geometry/GeometryShape.cpp b/source/geometry/GeometryShape.cpp new file mode 100644 index 00000000..f4b824a8 --- /dev/null +++ b/source/geometry/GeometryShape.cpp @@ -0,0 +1,226 @@ +// +// GeometryShape.cpp +// MNN +// +// Created by MNN on 2021/03/08. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include +#include "core/AutoStorage.h" +#include "geometry/GeometryComputer.hpp" +#include "geometry/GeometryComputerUtils.hpp" +#include "backend/cpu/compute/CommonOptFunction.h" + +namespace MNN { +class GeometryShape : public GeometryComputer { +public: + virtual bool onCompute(const Op* op, const std::vector& inputs, const std::vector& outputs, + Context& context, CommandBuffer& res) const override { + if(!context.allocTensor(outputs[0])) { + return false; + } + auto& ib = inputs[0]->buffer(); + auto outputData = outputs[0]->host(); + auto inputFormat = TensorUtils::getDescribe(inputs[0])->dimensionFormat; + if ((inputFormat == MNN_DATA_FORMAT_NC4HW4) && TensorUtils::getDescribe(outputs[0])->dimensionFormat == MNN_DATA_FORMAT_NHWC) { + outputData[0] = ib.dim[0].extent; + outputData[1] = ib.dim[2].extent; + outputData[2] = ib.dim[3].extent; + outputData[3] = ib.dim[1].extent; + } else { + for (int i = 0; i < ib.dimensions; i++) { + outputData[i] = ib.dim[i].extent; + } + } + return true; + } +}; + +class GeometryRank : public GeometryComputer { +public: + virtual bool onCompute(const Op* op, const std::vector& inputs, const std::vector& outputs, + Context& context, CommandBuffer& res) const override { + if(!context.allocTensor(outputs[0])) { + return false; + } + outputs[0]->host()[0] = inputs[0]->buffer().dimensions; + return true; + } +}; + +class GeometryPriorBox : public GeometryComputer { +public: + virtual bool onCompute(const Op* op, const std::vector& inputs, const std::vector& outputs, + Context& context, CommandBuffer& res) const override { + if(!context.allocTensor(outputs[0])) { + return false; + } + AutoStorage mOutputData; + mOutputData.reset(outputs[0]->height() * outputs[0]->channel()); + + auto layer = op->main_as_PriorBox(); + auto input0 = inputs[0]; + const int w = input0->width(); + const int h = input0->height(); + + // image width, height + int imageW = layer->imageWidth(); + if (imageW <= 0) { + imageW = inputs[1]->width(); + } + int imageH = layer->imageHeight(); + if (imageH <= 0) { + imageH = inputs[1]->height(); + } + + // step width, height + float stepW = layer->stepWidth(); + if (stepW <= 0) { + stepW = (float)imageW / w; + } + float stepH = layer->stepHeight(); + if (stepH <= 0) { + stepH = (float)imageH / h; + } + + // sizes + auto minSizes = layer->minSizes(); + auto minSizeCount = minSizes ? minSizes->size() : 0; + auto maxSizes = layer->maxSizes(); + auto maxSizeCount = maxSizes ? 
maxSizes->size() : 0; + auto aspectRatios = layer->aspectRatios(); + bool flip = layer->flip(); + + std::vector aspectRatiosValue{1.0f}; + if (aspectRatios != nullptr) { + for (int i = 0; i < aspectRatios->size(); ++i) { + auto ratio = aspectRatios->data()[i]; + bool exist = false; + for (auto v : aspectRatiosValue) { + auto diff = v - ratio; + if (diff < 0) { + diff = -diff; + } + if (diff < 1e-6) { + exist = true; + break; + } + } + if (!exist) { + aspectRatiosValue.emplace_back(ratio); + if (flip) { + aspectRatiosValue.emplace_back(1.0f / ratio); + } + } + } + } + int priorCount = minSizeCount * aspectRatiosValue.size() + maxSizeCount; + + // boxes + float offset = layer->offset(); + auto boxesPtr = mOutputData.get(); + for (int i = 0; i < h; i++) { + float *box = boxesPtr + i * w * priorCount * 4; + float centerX = offset * stepW; + float centerY = offset * stepH + i * stepH; + for (int j = 0; j < w; j++, centerX += stepW) { + for (int k = 0; k < minSizeCount; k++) { + // min size box + float minSize = minSizes->data()[k]; + { + box[0] = (centerX - minSize * 0.5f) / imageW; + box[1] = (centerY - minSize * 0.5f) / imageH; + box[2] = (centerX + minSize * 0.5f) / imageW; + box[3] = (centerY + minSize * 0.5f) / imageH; + box += 4; + } + + // max size box + if (maxSizeCount > 0) { + float maxSize = maxSizes->data()[k]; + float ssqrt = sqrt(minSize * maxSize); + + box[0] = (centerX - ssqrt * 0.5f) / imageW; + box[1] = (centerY - ssqrt * 0.5f) / imageH; + box[2] = (centerX + ssqrt * 0.5f) / imageW; + box[3] = (centerY + ssqrt * 0.5f) / imageH; + box += 4; + } + + // aspect ratios + for (int p = 0; p < aspectRatiosValue.size(); p++) { + float arsqrt = sqrt(aspectRatiosValue[p]); + if (fabsf(arsqrt - 1.0f) < 1e-6) { + continue; + } + float boxW = minSize * arsqrt; + float boxH = minSize / arsqrt; + + box[0] = (centerX - boxW * 0.5f) / imageW; + box[1] = (centerY - boxH * 0.5f) / imageH; + box[2] = (centerX + boxW * 0.5f) / imageW; + box[3] = (centerY + boxH * 0.5f) / imageH; + box += 4; + } + } + } + } + + // clip + int oh = outputs[0]->height(); + if (layer->clip()) { + float *box = boxesPtr; + for (int i = 0; i < oh; i++) { + box[i] = std::min(std::max(box[i], 0.f), 1.f); + } + } + + // set variance + auto variances = layer->variances()->data(); + auto var = boxesPtr + oh; + for (int i = 0; i < oh / 4; i++) { + var[0] = variances[0]; + var[1] = variances[1]; + var[2] = variances[2]; + var[3] = variances[3]; + var += 4; + } + + // transform to output + auto outputData = outputs[0]->host(); + MNNPackC4(outputData, mOutputData.get(), outputs[0]->height(), outputs[0]->channel()); + return true; + } +}; + +class GeometrySize : public GeometryComputer { +public: + virtual bool onCompute(const Op* op, const std::vector& inputs, const std::vector& outputs, + Context& context, CommandBuffer& res) const override { + if(!context.allocTensor(outputs[0])) { + return false; + } + int count = 1; + for (int i = 0; i < inputs[0]->buffer().dimensions; i++) { + count *= inputs[0]->buffer().dim[i].extent; + } + outputs[0]->host()[0] = count; + return true; + } +}; + +static void _create() { + std::shared_ptr comp(new GeometryShape); + GeometryComputer::registerGeometryComputer(comp, {OpType_Shape}); + std::shared_ptr comp1(new GeometryRank); + GeometryComputer::registerGeometryComputer(comp1, {OpType_Rank}); + std::shared_ptr comp2(new GeometryPriorBox); + GeometryComputer::registerGeometryComputer(comp2, {OpType_PriorBox}); + std::shared_ptr comp3(new GeometrySize); + 
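The geometry computers added in GeometryShape.cpp above all follow one pattern: instead of emitting regions or commands, the output is allocated as host-visible constant memory (context.allocTensor) and filled immediately from the input's metadata. A stand-alone sketch of that pattern with toy types; the helper below is illustrative, not MNN's API.

#include <cstdint>
#include <vector>

// Illustrative stand-in: a tensor that only carries a shape and host storage.
struct ToyTensor {
    std::vector<int> dims;
    std::vector<int32_t> hostData;
};

// Assumed helper that gives the output backing memory so it can be written now,
// mirroring what context.allocTensor(outputs[0]) does in the hunk above.
bool allocConstantOutput(ToyTensor& out, size_t elements) {
    out.hostData.assign(elements, 0);
    return true;
}

// "Shape" folded at geometry time: write the input's extents into the output.
bool computeShape(const ToyTensor& in, ToyTensor& out) {
    if (!allocConstantOutput(out, in.dims.size())) return false;
    for (size_t i = 0; i < in.dims.size(); ++i) out.hostData[i] = in.dims[i];
    return true;
}

// "Size" folded the same way: total element count as a single scalar.
bool computeSize(const ToyTensor& in, ToyTensor& out) {
    if (!allocConstantOutput(out, 1)) return false;
    int32_t count = 1;
    for (int d : in.dims) count *= d;
    out.hostData[0] = count;
    return true;
}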
GeometryComputer::registerGeometryComputer(comp3, {OpType_Size}); +} + +REGISTER_GEOMETRY(GeometryShape, _create); + +} // namespace MNN diff --git a/source/geometry/GeometrySpatialProduct.cpp b/source/geometry/GeometrySpatialProduct.cpp index 3d3a7a1e..ca3efa6c 100644 --- a/source/geometry/GeometrySpatialProduct.cpp +++ b/source/geometry/GeometrySpatialProduct.cpp @@ -31,12 +31,6 @@ public: int ih = input->height(); int ic = input->channel(); - MNN_ASSERT(ib == input1->batch()); - MNN_ASSERT(ic == input1->channel()); - MNN_ASSERT(ib == 1); - MNN_ASSERT(iw == input1->width()); - MNN_ASSERT(ih == input1->height()); - auto ob = output->batch(); auto oc = output->channel(); auto oh = output->height(); diff --git a/source/geometry/GeometryTensorArray.cpp b/source/geometry/GeometryTensorArray.cpp index e757b97a..8aec52c5 100644 --- a/source/geometry/GeometryTensorArray.cpp +++ b/source/geometry/GeometryTensorArray.cpp @@ -7,6 +7,7 @@ // #include "geometry/GeometryComputer.hpp" +#include "geometry/GeometryComputerUtils.hpp" #include "core/OpCommonUtils.hpp" namespace MNN { // get a pair @@ -14,25 +15,22 @@ static std::pair getElemSize(const Tensor* t, int index) { auto des = TensorUtils::getDescribe(t); auto shapes = des->tensorArrayAttr->elemShape; int elemSize = 1; - if (des->tensorArrayAttr->isIdenticalShape) { - if (shapes.size() == 1) { + if (!des->tensorArrayAttr->isIdenticalShape && shapes.size() > index) { + int offset = 0; + for (int i = 0; i <= index; i++) { elemSize = 1; - std::for_each(shapes[0].begin(), shapes[0].end(), [&elemSize](int x) { elemSize *= x; }); - return {index * elemSize, elemSize}; + std::for_each(shapes[i].begin(), shapes[i].end(), [&elemSize](int x) { elemSize *= x; }); + offset += elemSize; } + return {offset - elemSize, elemSize}; + } else if (shapes.size() >= 1) { + elemSize = 1; + std::for_each(shapes[0].begin(), shapes[0].end(), [&elemSize](int x) { elemSize *= x; }); + return {index * elemSize, elemSize}; } else { - if (shapes.size() > index) { - int offset = 0; - for (int i = 0; i <= index; i++) { - elemSize = 1; - std::for_each(shapes[i].begin(), shapes[i].end(), [&elemSize](int x) { elemSize *= x; }); - offset += elemSize; - } - return {offset - elemSize, elemSize}; - } + MNN_ASSERT(false); + return {0, 0}; } - MNN_ASSERT(false); - return {0, 0}; } static bool isFirstWrite(const Tensor::InsideDescribe* des) { @@ -85,26 +83,10 @@ public: MNN_ASSERT(false); return false; } - auto output = outputs[0]; - auto inputDes = TensorUtils::getDescribe(tensorArrayInput); - auto outputDes = TensorUtils::getDescribe(output); - outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL; - outputDes->regions.resize(1); - auto& reg = outputDes->regions[0]; - auto sizeConst = context.allocConst(op, {}, halide_type_of()); - sizeConst->host()[0] = inputDes->tensorArrayAttr->arraySize; - reg.origin = sizeConst.get(); - reg.src.offset = 0; - reg.src.stride[0] = 1; - reg.src.stride[1] = 1; - reg.src.stride[2] = 1; - reg.dst.offset = 0; - reg.dst.stride[0] = 1; - reg.dst.stride[1] = 1; - reg.dst.stride[2] = 1; - reg.size[0] = 1; - reg.size[1] = 1; - reg.size[2] = 1; + if (!context.allocTensor(outputs[0])) { + return false; + } + outputs[0]->host()[0] = TensorUtils::getDescribe(tensorArrayInput)->tensorArrayAttr->arraySize; return true; } }; @@ -284,7 +266,6 @@ public: if (inDes->tensorArrayAttr == nullptr) { return false; } - MNN_ASSERT(inDes->tensorArrayAttr->isIdenticalShape); int oldSize = inDes->tensorArrayAttr->arraySize; auto output = outputs[0]; int elemSize = 
getElemSize(output, 0).second; @@ -396,7 +377,6 @@ public: MNN_ASSERT(false); return false; } - //MNN_ASSERT(inDes->tensorArrayAttr->isIdenticalShape); auto output = outputs[0]; auto outputDes = TensorUtils::getDescribe(output); outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL; diff --git a/source/geometry/GeometryTanH.cpp b/source/geometry/GeometryUnary.cpp similarity index 55% rename from source/geometry/GeometryTanH.cpp rename to source/geometry/GeometryUnary.cpp index a6cfbef5..313d3291 100644 --- a/source/geometry/GeometryTanH.cpp +++ b/source/geometry/GeometryUnary.cpp @@ -1,5 +1,5 @@ // -// GeometryTanH.cpp +// GeometryUnary.cpp // MNN // // Created by MNN on 2020/07/27. @@ -11,24 +11,35 @@ #include "geometry/GeometryComputerUtils.hpp" namespace MNN { -class GeometryTanH : public GeometryComputer { +class GeometryUnary : public GeometryComputer { public: virtual bool onCompute(const Op* op, const std::vector& inputs, const std::vector& outputs, Context& context, CommandBuffer& res) const override { MNN_ASSERT(1 == inputs.size()); MNN_ASSERT(1 == outputs.size()); auto input = inputs[0]; auto output = outputs[0]; - auto cmd = GeometryComputerUtils::makeUnary(UnaryOpOperation_TANH, input, output); + UnaryOpOperation unaryType; + switch (op->type()) { + case OpType_TanH: + unaryType = UnaryOpOperation_TANH; + break; + case OpType_Sigmoid: + unaryType = UnaryOpOperation_SIGMOID; + break; + default: + break; + } + auto cmd = GeometryComputerUtils::makeUnary(unaryType, input, output); res.command.emplace_back(std::move(cmd)); return true; } }; static void _create() { - std::shared_ptr comp(new GeometryTanH); - GeometryComputer::registerGeometryComputer(comp, {OpType_TanH}); + std::shared_ptr comp(new GeometryUnary); + GeometryComputer::registerGeometryComputer(comp, {OpType_TanH, OpType_Sigmoid}); } -REGISTER_GEOMETRY(GeometryTanH, _create); +REGISTER_GEOMETRY(GeometryUnary, _create); } // namespace MNN diff --git a/source/math/Vec.hpp b/source/math/Vec.hpp index 0caa8c65..73625d6c 100644 --- a/source/math/Vec.hpp +++ b/source/math/Vec.hpp @@ -13,6 +13,13 @@ #ifdef MNN_USE_NEON #include #endif +#ifdef MNN_USE_SSE +#if defined(_MSC_VER) +#include +#else +#include +#endif +#endif namespace MNN { namespace Math { @@ -192,7 +199,7 @@ struct Vec { VecType dst = { vqneg_s8(value) }; return dst; } - + VecType& operator = (const VecType& lr) { value = lr.value; return *this; @@ -247,7 +254,18 @@ struct Vec { VecType dst = { vqnegq_s8(value) }; return dst; } - + + VecType operator*(int8_t lr) { + MNN_ERROR("Vec[NEON]: int8_t multiply maybe overflow!"); + VecType dst = { vmulq_s8(value, vdupq_n_s8(lr)) }; + return dst; + } + VecType operator*(const VecType& lr) { + MNN_ERROR("Vec[NEON]: int8_t multiply maybe overflow!"); + VecType dst = { vmulq_s8(value, lr.value) }; + return dst; + } + VecType& operator=(const VecType& lr) { value = lr.value; return *this; @@ -283,7 +301,6 @@ struct Vec { } }; #elif defined(MNN_USE_SSE) -#include template<> struct Vec { using VecType = Vec; @@ -354,6 +371,103 @@ struct Vec { return dst; } }; +template<> +struct Vec { + using VecType = Vec; + __m128i value; + VecType operator+(const VecType& lr) { + VecType dst = { _mm_add_epi8(value, lr.value) }; + return dst; + } + VecType operator-(const VecType& lr) { + VecType dst = { _mm_sub_epi8(value, lr.value) }; + return dst; + } + VecType operator*(const VecType& lr) { + MNN_ERROR("Vec[SSE]: int8_t multiply maybe overflow!"); + VecType dst = { _mul_epi8(value, lr.value) }; + return dst; + } + VecType 
operator*(float lr) { + MNN_ERROR("Vec[SSE]: int8_t multiply maybe overflow!"); + VecType dst = { _mul_epi8(value, _mm_set1_epi8(lr)) }; + return dst; + } + + VecType& operator=(const VecType& lr) { + value = lr.value; + return *this; + } + VecType operator-() { + VecType dst; +#if defined(_MSC_VER) + dst.value = _mm_sign_epi8(value, _mm_set1_epi8(-1)); // Using unary operation to SSE vec is GCC extension. We can not do this directly in MSVC. +#else + dst.value = -value; +#endif + return dst; + } + Vec() { + } + Vec(const int8_t v) { + value = _mm_set1_epi8(v); + } + Vec(__m128i&& v) { + value = v; + } + Vec(const VecType& lr) { + value = lr.value; + } + float operator[](size_t i) { +#if defined(_MSC_VER) // X64 native only mandatory support SSE and SSE2 extension, and we can not find intrinsic function to extract element directly by index in SSE and SSE2 extension. + int8_t temp[16]; + _mm_storeu_ps((float*)temp, _mm_castsi128_ps(value)); + return temp[i]; +#else + return value[i]; +#endif + } + static VecType load(const int8_t* addr) { + VecType v = { _mm_castps_si128(_mm_loadu_ps((const float*)addr)) }; + return v; + } + static void save(int8_t* addr, const VecType& v) { + _mm_storeu_ps((float*)addr, _mm_castsi128_ps(v.value)); + } + static VecType max(const VecType& v1, const VecType& v2) { + VecType dst = { _max_epi8(v1.value, v2.value) }; + return dst; + } + static VecType min(const VecType& v1, const VecType& v2) { + VecType dst = { _min_epi8(v1.value, v2.value) }; + return dst; + } +private: + static __m128i _max_epi8(__m128i a, __m128i b) { +#ifdef __SSE4_1__ + return _mm_max_epi8(a, b); +#else + auto mask0 = _mm_cmpgt_epi8(a, b); + auto mask1 = _mm_xor_si128(mask0, _mm_cmpeq_epi8(mask0, mask0)); + return _mm_or_si128(_mm_and_si128(mask0, a), _mm_and_si128(mask1, b)); +#endif + } + static __m128i _min_epi8(__m128i a, __m128i b) { +#ifdef __SSE4_1__ + return _mm_min_epi8(a, b); +#else + auto mask0 = _mm_cmplt_epi8(a, b); + auto mask1 = _mm_xor_si128(mask0, _mm_cmpeq_epi8(mask0, mask0)); + return _mm_or_si128(_mm_and_si128(mask0, a), _mm_and_si128(mask1, b)); +#endif + } + __m128i _mul_epi8(__m128i a, __m128i b) + { + __m128i dst_even = _mm_mullo_epi16(a, b); + __m128i dst_odd = _mm_mullo_epi16(_mm_srli_epi16(a, 8),_mm_srli_epi16(b, 8)); + return _mm_or_si128(_mm_slli_epi16(dst_odd, 8), _mm_srli_epi16(_mm_slli_epi16(dst_even,8), 8)); + } +}; #endif } // namespace Math } // namespace MNN diff --git a/source/math/WingoradGenerater.cpp b/source/math/WingoradGenerater.cpp index cef7c586..f66285b0 100644 --- a/source/math/WingoradGenerater.cpp +++ b/source/math/WingoradGenerater.cpp @@ -192,7 +192,7 @@ std::shared_ptr WinogradGenerater::allocTransformWeight(const Tensor* so return std::shared_ptr(Tensor::createDevice({mB->length(0) * mB->length(1), coC4, ciC4, unitCi, unitCo})); } -void WinogradGenerater::transformWeight(const Tensor* weightDest, const Tensor* source) { +void WinogradGenerater::transformWeight(const Tensor* weightDest, const Tensor* source, bool ciFirst) { std::shared_ptr GT(Math::Matrix::create(mG->length(0), mG->length(1))); Math::Matrix::transpose(GT.get(), mG.get()); int ci = source->length(1); @@ -210,13 +210,19 @@ void WinogradGenerater::transformWeight(const Tensor* weightDest, const Tensor* std::shared_ptr K_Transform(Math::Matrix::create(alpha, alpha)); auto weightPtr = source->host(); auto KTransformData = K_Transform->host(); + int lCi = unitCo; + int lCo = 1; + if (ciFirst) { + lCi = 1; + lCo = unitCi; + } for (int oz = 0; oz < co; ++oz) { auto srcOz = 
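SSE has no 8-bit _mm_mullo, so the _mul_epi8 helper in the Vec.hpp hunk above emulates it with two 16-bit multiplies over the even and odd byte lanes and repacks the low bytes. A scalar reference with the same truncating semantics, useful as a hypothetical unit-test oracle for that path (not part of the patch):

#include <cstdint>
#include <cstdio>

// Scalar model of the SSE even/odd-lane trick: each int8 product keeps only
// its low 8 bits, exactly like _mm_mullo_epi16 followed by byte repacking.
void mulInt8Reference(const int8_t* a, const int8_t* b, int8_t* dst, int n) {
    for (int i = 0; i < n; ++i) {
        const int product = static_cast<int>(a[i]) * static_cast<int>(b[i]);
        dst[i] = static_cast<int8_t>(product & 0xFF);  // may overflow by design
    }
}

int main() {
    const int8_t a[4] = {10, -3, 100, 7};
    const int8_t b[4] = {12, 5, 3, -2};
    int8_t out[4];
    mulInt8Reference(a, b, out, 4);
    std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); // 120 -15 44 -14
    return 0;
}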
weightPtr + oz * ci * kernelCount * kernelCount; int ozC4 = oz / unitCo; int mx = oz % unitCo; - auto dstOz = weightDest->host() + weightDest->stride(1) * ozC4 + mx; + auto dstOz = weightDest->host() + weightDest->stride(1) * ozC4 + mx * lCo; for (int sz = 0; sz < ci; ++sz) { int szC4 = sz / unitCi; int my = sz % unitCi; @@ -227,7 +233,7 @@ void WinogradGenerater::transformWeight(const Tensor* weightDest, const Tensor* // K_Transform = M*GT Math::Matrix::multi(K_Transform.get(), M.get(), GT.get()); - auto dstSz = dstOz + szC4 * weightDest->stride(2) + unitCo * my; + auto dstSz = dstOz + szC4 * weightDest->stride(2) + my * lCi; for (int i = 0; i < alpha * alpha; ++i) { *(dstSz + i * weightDest->stride(0)) = KTransformData[i]; diff --git a/source/math/WingoradGenerater.hpp b/source/math/WingoradGenerater.hpp index 82383df4..7c56b1c0 100644 --- a/source/math/WingoradGenerater.hpp +++ b/source/math/WingoradGenerater.hpp @@ -29,7 +29,7 @@ public: } std::shared_ptr allocTransformWeight(const Tensor* originWeight, int unitCi = 4, int unitCo = 4, bool alloc = true); - void transformWeight(const Tensor* dest, const Tensor* source); + void transformWeight(const Tensor* dest, const Tensor* source, bool ciFirst = false); private: std::shared_ptr mA; diff --git a/source/shape/ShapeBinaryOp.cpp b/source/shape/ShapeBinaryOp.cpp index 81d3b72e..0834c9d0 100644 --- a/source/shape/ShapeBinaryOp.cpp +++ b/source/shape/ShapeBinaryOp.cpp @@ -50,85 +50,14 @@ public: MNN_PRINT("Error for binary op: input0's type != input1's type\n"); return false; } + if (input0->dimensions() < input1->dimensions()) { auto temp = input0; input0 = input1; input1 = temp; } TensorUtils::getDescribe(output)->dimensionFormat = TensorUtils::getDescribe(input0)->dimensionFormat; - - // if one scalar input -> just copy the other - if (input1->dimensions() == 0) { - TensorUtils::copyShape(input0, output); - return true; - } - - // else if inputs shape equals -> just copy any one - bool sameShape = true; - if (input0->dimensions() == input1->dimensions()) { - for (int i = 0; i < input0->buffer().dimensions; i++) { - if (input0->buffer().dim[i].extent != input1->buffer().dim[i].extent) { - sameShape = false; - break; - } - } - } - else { - sameShape = false; - } - if (sameShape) { - TensorUtils::copyShape(input0, output); - return true; - } - - // else if broadcast NOT supported -> failed - const int maxDimensions = input0->dimensions(); - const int diffDimension = input0->dimensions() - input1->dimensions(); - - std::vector outputDims(maxDimensions); - for (int i = 0; i < maxDimensions; i++) { - outputDims[i] = input0->buffer().dim[i].extent; - } - for (int i = diffDimension; i < maxDimensions; i++) { - const int input1Index = i - diffDimension; - int dim1 = input1->buffer().dim[input1Index].extent; - if (dim1 != outputDims[i] && (dim1 != 1 && outputDims[i] != 1)) { - if (op->name() == nullptr) { - MNN_PRINT("Don't support broadcast for binaryOp, i0=%d, i1=%d\n", outputDims[i], dim1); - } else { - MNN_PRINT("Don't support broadcast for binaryOp %s, i0=%d, i1=%d\n", op->name()->c_str(), outputDims[i], dim1); - } - MNN_PRINT("broadcast shape info:\n"); - MNN_PRINT("input0: "); - for (int ii = 0; ii < input0->dimensions(); ii++) { - MNN_PRINT("dim%d: %d ", ii, input0->buffer().dim[ii].extent); - } - MNN_PRINT("\n"); - MNN_PRINT("input1: "); - for (int ii = 0; ii < input1->dimensions(); ii++) { - MNN_PRINT("dim%d: %d ", ii, input1->buffer().dim[ii].extent); - } - MNN_PRINT("\n"); - - return false; - } - if (dim1 == outputDims[i]) { - 
continue; - } - if (dim1 != outputDims[i] && (dim1 == 1 || outputDims[i] == 1)) { - outputDims[i] = outputDims[i] * dim1; - } else { - MNN_PRINT("Error, the logic flow should never get here"); - return false; - } - } - - buffer.dimensions = maxDimensions; - for (int i = 0; i < maxDimensions; i++) { - buffer.dim[i].extent = outputDims[i]; - } - - return true; + return SizeComputer::computeBroadCastDims(op, inputs, outputs); } }; diff --git a/source/shape/ShapeConvolution.cpp b/source/shape/ShapeConvolution.cpp index 34124900..3f1cc919 100644 --- a/source/shape/ShapeConvolution.cpp +++ b/source/shape/ShapeConvolution.cpp @@ -67,8 +67,8 @@ public: MNN_ASSERT(layer->pads()->size() >= 4); int input_width = input->width() + layer->pads()->data()[1] + layer->pads()->data()[3]; int input_height = input->height() + layer->pads()->data()[0] + layer->pads()->data()[2]; - output_width = (input_width - kernel_width) / layer->strideX() + 1; - output_height = (input_height - kernel_height) / layer->strideY() + 1; + output_width = input_width < kernel_width ? 0 : (input_width - kernel_width) / layer->strideX() + 1; + output_height = input_height < kernel_height ? 0 : (input_height - kernel_height) / layer->strideY() + 1; } else { int input_width = input->width() + layer->padX() * 2; int input_height = input->height() + layer->padY() * 2; diff --git a/source/shape/ShapeConvolution3D.cpp b/source/shape/ShapeConvolution3D.cpp index 5729f863..9ca39ac3 100644 --- a/source/shape/ShapeConvolution3D.cpp +++ b/source/shape/ShapeConvolution3D.cpp @@ -50,20 +50,6 @@ public: TensorUtils::getDescribe(outputs[0])->dimensionFormat = TensorUtils::getDescribe(inputs[0])->dimensionFormat; return true; } - - virtual float onComputeFlops(const MNN::Op* op, const std::vector& inputs, - const std::vector& outputs) const override { - auto layer = op->main_as_Convolution3D()->common(); - int oSize = outputs[0]->length(1); - float flopsPerElement = inputs[0]->length(1); - for (int i = 0; i < 3; ++i) { - flopsPerElement *= (*layer->kernels())[i]; - oSize *= outputs[0]->length(i + 2); - } - float flops = oSize * flopsPerElement / FLOPS_M; - - return flops; - } }; REGISTER_SHAPE(Convolution3DSizeComputer, OpType_Convolution3D); diff --git a/source/shape/ShapeDeconvolution.cpp b/source/shape/ShapeDeconvolution.cpp index e334d868..f605a942 100644 --- a/source/shape/ShapeDeconvolution.cpp +++ b/source/shape/ShapeDeconvolution.cpp @@ -20,13 +20,8 @@ public: if (layer->hasOutputShape()) { MNN_ASSERT(inputs.size() >= 2); auto outputShape = inputs.back(); - if (outputShape->length(0) > 2) { - outputHeight = outputShape->host()[1]; - outputWidth = outputShape->host()[2]; - } else { - outputHeight = outputShape->host()[0]; - outputWidth = outputShape->host()[1]; - } + outputHeight = outputShape->host()[1]; + outputWidth = outputShape->host()[2]; } int input_width = inputTensor->width(); diff --git a/source/shape/ShapeGridSample.cpp b/source/shape/ShapeGridSample.cpp new file mode 100644 index 00000000..7a24f8ad --- /dev/null +++ b/source/shape/ShapeGridSample.cpp @@ -0,0 +1,53 @@ +// +// ShapeGridSample.cpp +// MNN +// +// Created by MNN on 2021/03/24. 
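ShapeBinaryOp.cpp above now delegates to SizeComputer::computeBroadCastDims. As a reminder of the rule it implements, a stand-alone sketch: align trailing dimensions, a length of 1 broadcasts against anything, and any other mismatch fails.

#include <cstdio>
#include <vector>

// Hypothetical free-standing version of the broadcast rule, not MNN code.
bool broadcastShape(const std::vector<int>& a, const std::vector<int>& b,
                    std::vector<int>& out) {
    const auto& big   = a.size() >= b.size() ? a : b;
    const auto& small = a.size() >= b.size() ? b : a;
    const int diff = static_cast<int>(big.size() - small.size());
    out = big;
    for (int i = diff; i < static_cast<int>(big.size()); ++i) {
        const int d1 = small[i - diff], d0 = big[i];
        if (d0 == d1)     continue;
        else if (d0 == 1) out[i] = d1;
        else if (d1 == 1) out[i] = d0;
        else              return false;  // incompatible extents
    }
    return true;
}

int main() {
    std::vector<int> out;
    bool ok = broadcastShape({4, 1, 5}, {3, 1}, out);  // -> {4, 3, 5}
    std::printf("%d: %d %d %d\n", ok, out[0], out[1], out[2]);
    return 0;
}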
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "shape/SizeComputer.hpp" +#include "core/Macro.h" + +namespace MNN { +class GridSampleSizeComputer : public SizeComputer { + virtual bool onComputeSize(const MNN::Op *op, const std::vector &inputs, + const std::vector &outputs) const override { + // https://pytorch.org/docs/1.7.1/nn.functional.html?highlight=grid_sample#torch.nn.functional.grid_sample + // inputs[0] is input, inputs[1] is grid + MNN_ASSERT(2 == inputs.size()); + MNN_ASSERT(1 == outputs.size()); + MNN_ASSERT(4 == inputs[0]->buffer().dimensions && 4 == inputs[1]->buffer().dimensions); + MNN_ASSERT(inputs[0]->buffer().dim[0].extent == inputs[1]->buffer().dim[0].extent); + MNN_ASSERT(2 == inputs[1]->buffer().dim[3].extent); + + auto &ibInput0 = inputs[0]->buffer(); + auto &ibInput1 = inputs[1]->buffer(); + auto &ob = outputs[0]->buffer(); + + ob.dimensions = ibInput1.dimensions; + ob.dim[0].extent = ibInput0.dim[0].extent; + ob.dim[1].extent = ibInput0.dim[1].extent; + ob.dim[2].extent = ibInput1.dim[1].extent; + ob.dim[3].extent = ibInput1.dim[2].extent; + + ob.type = ibInput0.type; + TensorUtils::getDescribe(outputs[0])->dimensionFormat = TensorUtils::getDescribe( + inputs[0])->dimensionFormat; + return true; + } + + virtual float onComputeFlops(const MNN::Op *op, const std::vector &inputs, + const std::vector &outputs) const override { + auto gridSampleParam = op->main_as_GridSample(); + if (gridSampleParam->mode() == MNN::SampleMode_BILINEAR) { + return 4 * SizeComputer::onComputeFlops(op, inputs, outputs); + } + + return SizeComputer::onComputeFlops(op, inputs, outputs); + } +}; + +REGISTER_SHAPE(GridSampleSizeComputer, OpType_GridSample); + +} // namespace MNN diff --git a/source/shape/ShapeMatMul.cpp b/source/shape/ShapeMatMul.cpp index 81bf7f6e..fa79fa05 100644 --- a/source/shape/ShapeMatMul.cpp +++ b/source/shape/ShapeMatMul.cpp @@ -15,7 +15,6 @@ namespace MNN { class MatMulSizeComputer : public SizeComputer { virtual bool onComputeSize(const MNN::Op* op, const std::vector& inputs, const std::vector& outputs) const override { - MNN_ASSERT(2 == inputs.size()); MNN_ASSERT(1 == outputs.size()); MNN_ASSERT(op->main_type() == OpParameter_MatMul); auto matMul = op->main_as_MatMul(); diff --git a/source/shape/ShapeRegister.cpp b/source/shape/ShapeRegister.cpp index eac987e3..9805d99e 100644 --- a/source/shape/ShapeRegister.cpp +++ b/source/shape/ShapeRegister.cpp @@ -18,6 +18,7 @@ extern void ___ReductionComputer__OpType_Reduction__(); extern void ___QuantizedAvgPoolComputer__OpType_QuantizedAvgPool__(); extern void ___ArgMaxComputer__OpType_ArgMax__(); extern void ___ArgMaxComputer__OpType_ArgMin__(); +extern void ___GridSampleSizeComputer__OpType_GridSample__(); extern void ___DepthToSpaceSizeComputer__OpType_DepthToSpace__(); extern void ___SliceTfComputer__OpType_SliceTf__(); extern void ___SelectSizeComputer__OpType_Select__(); @@ -116,6 +117,7 @@ ___ReductionComputer__OpType_Reduction__(); ___QuantizedAvgPoolComputer__OpType_QuantizedAvgPool__(); ___ArgMaxComputer__OpType_ArgMax__(); ___ArgMaxComputer__OpType_ArgMin__(); +___GridSampleSizeComputer__OpType_GridSample__(); ___DepthToSpaceSizeComputer__OpType_DepthToSpace__(); ___SliceTfComputer__OpType_SliceTf__(); ___SelectSizeComputer__OpType_Select__(); diff --git a/source/shape/ShapeSelect.cpp b/source/shape/ShapeSelect.cpp index 243e9365..64c82815 100644 --- a/source/shape/ShapeSelect.cpp +++ b/source/shape/ShapeSelect.cpp @@ -10,6 +10,7 @@ #include "core/Macro.h" #include "core/TensorUtils.hpp" 
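For the new GridSampleSizeComputer above the rule is simply batch and channels from the input, spatial extents from the grid, matching the PyTorch grid_sample convention the file links to. A minimal illustrative check (hypothetical helper, not MNN code):

#include <array>
#include <cassert>

// input: {N, C, H_in, W_in}, grid: {N, H_out, W_out, 2}
std::array<int, 4> gridSampleOutputShape(const std::array<int, 4>& input,
                                         const std::array<int, 4>& grid) {
    assert(input[0] == grid[0]);  // batch must agree
    assert(grid[3] == 2);         // last grid dim holds (x, y) coordinates
    return {input[0], input[1], grid[1], grid[2]};
}

int main() {
    auto shape = gridSampleOutputShape({2, 16, 32, 32}, {2, 24, 28, 2});
    assert(shape == (std::array<int, 4>{2, 16, 24, 28}));
    return 0;
}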
namespace MNN { + class SelectSizeComputer : public SizeComputer { public: virtual bool onComputeSize(const MNN::Op* op, const std::vector& inputs, @@ -18,9 +19,11 @@ public: MNN_ASSERT(1 == outputs.size()); const auto& ib = inputs[1]->buffer(); auto& ob = outputs[0]->buffer(); - memcpy(ob.dim, ib.dim, sizeof(halide_dimension_t) * ib.dimensions); - ob.dimensions = ib.dimensions; ob.type = inputs[1]->buffer().type; + bool res = SizeComputer::computeBroadCastDims(op, inputs, outputs); + if (!res) { + return false; + } TensorUtils::getDescribe(outputs[0])->dimensionFormat = TensorUtils::getDescribe(inputs[1])->dimensionFormat; return true; } diff --git a/source/shape/ShapeShape.cpp b/source/shape/ShapeShape.cpp index 3ef775df..48186d4c 100644 --- a/source/shape/ShapeShape.cpp +++ b/source/shape/ShapeShape.cpp @@ -30,6 +30,9 @@ class ShapeSizeComputer : public SizeComputer { } else { ob.dim[0].extent = ib.dimensions; } + if (ib.dimensions == 0) { + return false; + } return true; } }; diff --git a/source/shape/ShapeTensorArray.cpp b/source/shape/ShapeTensorArray.cpp index af9c67b5..c874b357 100644 --- a/source/shape/ShapeTensorArray.cpp +++ b/source/shape/ShapeTensorArray.cpp @@ -103,19 +103,13 @@ class TensorArrayReadComputer : public SizeComputer { return false; } std::vector readElemShape; - if (des->tensorArrayAttr->isIdenticalShape) { - if (des->tensorArrayAttr->elemShape.size() == 1) { - readElemShape = des->tensorArrayAttr->elemShape[0]; - } else { - MNN_ASSERT(false); - } + int readIndex = inputs[1]->host()[0]; + if (!des->tensorArrayAttr->isIdenticalShape && des->tensorArrayAttr->elemShape.size() > readIndex) { + readElemShape = des->tensorArrayAttr->elemShape[readIndex]; + } else if (des->tensorArrayAttr->elemShape.size() >= 1) { + readElemShape = des->tensorArrayAttr->elemShape[0]; } else { - int readIndex = inputs[1]->host()[0]; - if (des->tensorArrayAttr->elemShape.size() > readIndex) { - readElemShape = des->tensorArrayAttr->elemShape[readIndex]; - } else { - MNN_ASSERT(false); - } + MNN_ASSERT(false); } outputs[0]->setType(op->main_as_TensorArray()->T()); outputs[0]->buffer().dimensions = readElemShape.size(); @@ -184,7 +178,6 @@ class TensorArrayGatherComputer : public SizeComputer { MNN_ASSERT(false); return false; } - MNN_ASSERT(inDes->tensorArrayAttr->isIdenticalShape); auto param = op->main_as_TensorArray(); outputs[0]->setType(param->T()); outDes->dimensionFormat = inDes->dimensionFormat; @@ -228,7 +221,6 @@ class TensorArrayScatterComputer : public SizeComputer { MNN_ASSERT(false); return false; } - MNN_ASSERT(inDes->tensorArrayAttr->isIdenticalShape); copyTensorArrayAttribute(inputs[3], outputs[0]); for (int i = 0; i < inputs[1]->length(0); i++) { int writeIndex = inputs[1]->host()[i]; @@ -304,9 +296,8 @@ class TensorArrayConcatComputer : public SizeComputer { MNN_ASSERT(false); return false; } - //MNN_ASSERT(inDes->tensorArrayAttr->isIdenticalShape); outputs[0]->setType(op->main_as_TensorArray()->T()); - if (inDes->tensorArrayAttr->elemShape.size() == 1) { + if (inDes->tensorArrayAttr->elemShape.size() >= 1) { outputs[0]->buffer().dimensions = inDes->tensorArrayAttr->elemShape[0].size() + 1; outputs[0]->setLength(0, inDes->tensorArrayAttr->arraySize); for (int i = 0; i < inDes->tensorArrayAttr->elemShape[0].size(); i++) { diff --git a/source/shape/ShapeTranspose.cpp b/source/shape/ShapeTranspose.cpp index fb9894a2..95bf0618 100644 --- a/source/shape/ShapeTranspose.cpp +++ b/source/shape/ShapeTranspose.cpp @@ -17,17 +17,10 @@ class TransposeComputer : public SizeComputer 
{ const Tensor* input = inputs[0]; Tensor* perm = inputs[1]; const int dims = input->buffer().dimensions; - MNN_ASSERT(dims == perm->buffer().dim[0].extent); - - std::vector permutation; - if (perm->getType().code == halide_type_int && 32 == perm->getType().bits) { - for (int i = 0; i < perm->buffer().dim[0].extent; i++) { - permutation.push_back(perm->host()[i]); - } - } else { - MNN_ASSERT(false); + if (perm->getType().code != halide_type_int || 32 != perm->getType().bits || dims != perm->buffer().dim[0].extent) { + return false; } - + auto permutation = perm->host(); outputs[0]->buffer().dimensions = dims; outputs[0]->buffer().type = input->getType(); for (int i = 0; i < dims; ++i) { diff --git a/source/shape/SizeComputer.cpp b/source/shape/SizeComputer.cpp index f48aa00c..eddb6bdb 100644 --- a/source/shape/SizeComputer.cpp +++ b/source/shape/SizeComputer.cpp @@ -50,30 +50,7 @@ float SizeComputer::onComputeFlops(const MNN::Op* op, const std::vector MNN_ASSERT(outputs.size() >= 1); return (float)outputs[0]->elementSize() / 1024.0f / 1024.0f; } -bool SizeComputer::opNeedContent(OpType type, int index) { - switch (type) { - case OpType_ZerosLike: - case OpType_ZeroGrad: - case OpType_Shape: - case OpType_Rank: - case OpType_Const: - case OpType_Size: - case OpType_PriorBox: - return false; - case OpType_Interp: - case OpType_Crop: - case OpType_Reshape: - case OpType_Reduction: - case OpType_Resize: - if (1 == index) { - return false; - } - break; - default: - break; - } - return true; -} + float SizeComputer::computeFlops(const MNN::Op* op, const std::vector& inputs, const std::vector& outputs) { auto computeFactory = SizeComputerSuite::get(); @@ -153,10 +130,16 @@ bool SizeComputer::computeOutputSize(const MNN::Op* op, const std::vector SizeComputer::needInputContent(const MNN::Op* op) { +std::vector SizeComputer::needInputContent(const MNN::Op* op, int inputSize) { auto computeFactory = SizeComputerSuite::get(); // When op is nullptr, it means a copy op if (nullptr != op) { + // when hasOutputShape = true, deconv last is outputShape + if (op->type() == OpType_Deconvolution && op->main_as_Convolution2D() && op->main_as_Convolution2D()->common()) { + if (op->main_as_Convolution2D()->common()->hasOutputShape()) { + return std::vector{ inputSize - 1 }; + } + } auto computer = computeFactory->search(op->type()); if (nullptr != computer) { return computer->mNeedContentInputIndex; @@ -164,5 +147,48 @@ std::vector SizeComputer::needInputContent(const MNN::Op* op) { } return std::vector{}; } - +bool SizeComputer::computeBroadCastDims(const MNN::Op* op, const std::vector& inputs, + const std::vector& outputs) { + int maxDimensions = inputs[0]->dimensions(); + int maxIndex = 0; + for (int index=1; index < inputs.size(); ++index) { + if (inputs[index]->dimensions() > maxDimensions) { + maxDimensions = inputs[index]->dimensions(); + maxIndex = index; + } + } + int outputDims[MNN_MAX_TENSOR_DIM]; + for (int i = 0; i < maxDimensions; i++) { + outputDims[i] = inputs[maxIndex]->length(i); + } + for (int index=0; index < inputs.size(); ++index) { + if (index == maxIndex) { + continue; + } + auto input1 = inputs[index]; + auto input0 = inputs[maxIndex]; + const int diffDimension = maxDimensions - input1->dimensions(); + for (int i = diffDimension; i < maxDimensions; i++) { + const int input1Index = i - diffDimension; + int dim1 = input1->buffer().dim[input1Index].extent; + if (dim1 != outputDims[i] && (dim1 != 1 && outputDims[i] != 1)) { + return false; + } + if (dim1 == outputDims[i]) { + continue; + } + 
if (dim1 != outputDims[i] && (dim1 == 1 || outputDims[i] == 1)) { + outputDims[i] = outputDims[i] * dim1; + } else { + return false; + } + } + } + auto& ob = outputs[0]->buffer(); + ob.dimensions = maxDimensions; + for (int i = 0; i < maxDimensions; i++) { + ob.dim[i].extent = outputDims[i]; + } + return true; +} } // namespace MNN diff --git a/source/shape/SizeComputer.hpp b/source/shape/SizeComputer.hpp index d8c42ba0..b26907ae 100644 --- a/source/shape/SizeComputer.hpp +++ b/source/shape/SizeComputer.hpp @@ -66,9 +66,10 @@ public: static float computeFlops(const MNN::Op* op, const std::vector& inputs, const std::vector& outputs); - static std::vector needInputContent(const MNN::Op* op); - static bool opNeedContent(const MNN::OpType type, int index); + static bool computeBroadCastDims(const MNN::Op* op, const std::vector& inputs, + const std::vector& outputs); + static std::vector needInputContent(const MNN::Op* op, int inputSize); private: std::vector mNeedContentInputIndex; }; diff --git a/source/utils/InitNet.cpp b/source/utils/InitNet.cpp index 2b72e0af..a2fe6fb2 100644 --- a/source/utils/InitNet.cpp +++ b/source/utils/InitNet.cpp @@ -11,10 +11,26 @@ namespace MNN { bool initTensors(std::vector>& tensors, const Net* net) { + auto describes = net->extraTensorDescribe(); + std::vector des(tensors.size()); + if (describes) { + for (int i = 0; i < describes->size(); i++) { + int index = describes->GetAs(i)->index(); + des[index] = describes->GetAs(i); + } + } bool valid = true; for (int i = 0; i < tensors.size(); ++i) { tensors[i].reset(new Tensor(4)); // NCHW, TODO tensors[i]->setType(DataType_DT_FLOAT); + if (des[i] != nullptr && des[i]->quantInfo()) { + TensorUtils::getDescribe(tensors[i].get())->quantAttr.reset(new QuantAttr); + auto quant = TensorUtils::getDescribe(tensors[i].get())->quantAttr.get(); + quant->scale = des[i]->quantInfo()->scale(); + quant->zero = des[i]->quantInfo()->zero(); + quant->min = des[i]->quantInfo()->min(); + quant->max = des[i]->quantInfo()->max(); + } } // Set Input Tensor, if the type of input is not the same with ExtraTensorDescribe, use input parameter for (int opIndex = 0; opIndex < net->oplists()->size(); ++opIndex) { diff --git a/test/TestUtils.h b/test/TestUtils.h index 58ec9e47..57d1159e 100644 --- a/test/TestUtils.h +++ b/test/TestUtils.h @@ -57,6 +57,7 @@ bool checkVectorByRelativeError(const T* result, const T* rightData, int size, f MNN_ASSERT(result != nullptr); MNN_ASSERT(rightData != nullptr); MNN_ASSERT(size >= 0); + float maxValue = 0.0f; for(int i = 0; i < size; ++i){ maxValue = fmax(fabs(rightData[i]), maxValue); diff --git a/test/core/BackendTest.cpp b/test/core/BackendTest.cpp index 90b526e3..680a9e30 100644 --- a/test/core/BackendTest.cpp +++ b/test/core/BackendTest.cpp @@ -11,10 +11,12 @@ #include #include "MNNTestSuite.h" #include "core/Backend.hpp" +#include "core/Macro.h" using namespace MNN; -void NCHW2NHWC(const float* source, float* dest, int b, int h, int w, int c) { +template +void NCHW2NHWC(const T* source, T* dest, int b, int h, int w, int c) { int sourceBatchsize = h * w * c; int destBatchSize = sourceBatchsize; for (int bi = 0; bi < b; ++bi) { @@ -34,13 +36,14 @@ void NCHW2NHWC(const float* source, float* dest, int b, int h, int w, int c) { } } -void MNNTensorConvertNHWCToNC4HW4(float* dst, const float* src, size_t area, size_t depth) { +template +void MNNTensorConvertNHWCToNC4HW4(T* dst, const T* src, size_t area, size_t depth) { int c = (int)depth; int cDiv4 = c / 4; int cAlign = cDiv4 * 4; for (int hi = 0; hi < area; 
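The InitNet.cpp hunk above copies per-tensor quantization attributes (scale, zero, min, max) from extraTensorDescribe onto each tensor's QuantAttr. For reference, the affine mapping those fields describe, written as a generic sketch rather than MNN's implementation:

#include <algorithm>
#include <cmath>
#include <cstdint>

struct QuantAttrSketch { float scale; float zero; float min; float max; };

// real value ~= scale * (q - zero); stored values are clamped to [min, max].
int8_t quantize(float real, const QuantAttrSketch& q) {
    float v = std::round(real / q.scale + q.zero);
    v = std::min(std::max(v, q.min), q.max);
    return static_cast<int8_t>(v);
}

float dequantize(int8_t stored, const QuantAttrSketch& q) {
    return q.scale * (static_cast<float>(stored) - q.zero);
}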
++hi) { - const float* srcHeight = src + hi * c; - float* dstHeight = dst + hi * 4; + const auto srcHeight = src + hi * c; + auto dstHeight = dst + hi * 4; for (int ci = 0; ci < cDiv4; ++ci) { for (int i = 0; i < 4; ++i) { dstHeight[ci * area * 4 + i] = srcHeight[4 * ci + i]; @@ -57,8 +60,8 @@ void MNNTensorConvertNHWCToNC4HW4(float* dst, const float* src, size_t area, siz auto dstAlign = dst + area * cAlign; for (int hi = 0; hi < area; ++hi) { - const float* srcHeight = srcAlign + hi * c; - float* dstHeight = dstAlign + hi * 4; + const auto srcHeight = srcAlign + hi * c; + auto dstHeight = dstAlign + hi * 4; for (int i = 0; i < 4; ++i) { dstHeight[i] = 0; @@ -70,13 +73,14 @@ void MNNTensorConvertNHWCToNC4HW4(float* dst, const float* src, size_t area, siz } } -void MNNTensorConvertNC4HW4ToNHWC(float* dst, const float* src, size_t area, size_t depth) { +template +void MNNTensorConvertNC4HW4ToNHWC(T* dst, const T* src, size_t area, size_t depth) { int c = (int)depth; int cDiv4 = c / 4; int cAlign = cDiv4 * 4; for (int hi = 0; hi < area; ++hi) { - const float* srcHeight = src + hi * 4; - float* dstHeight = dst + hi * c; + const auto srcHeight = src + hi * 4; + auto dstHeight = dst + hi * c; for (int ci = 0; ci < cDiv4; ++ci) { for (int i = 0; i < 4; ++i) { dstHeight[ci * 4 + i] = srcHeight[4 * ci * area + i]; @@ -93,8 +97,8 @@ void MNNTensorConvertNC4HW4ToNHWC(float* dst, const float* src, size_t area, siz auto dstAlign = dst + cAlign; for (int hi = 0; hi < area; ++hi) { - const float* srcHeight = srcAlign + hi * 4; - float* dstHeight = dstAlign + hi * c; + const auto srcHeight = srcAlign + hi * 4; + auto dstHeight = dstAlign + hi * c; for (int ci = 0; ci < cReamin; ++ci) { dstHeight[ci] = srcHeight[ci]; @@ -102,7 +106,8 @@ void MNNTensorConvertNC4HW4ToNHWC(float* dst, const float* src, size_t area, siz } } -void NHWC2NCHW(const float* source, float* dest, int b, int h, int w, int c) { +template +void NHWC2NCHW(const T* source, T* dest, int b, int h, int w, int c) { int sourceBatchsize = h * w * c; int destBatchSize = sourceBatchsize; for (int bi = 0; bi < b; ++bi) { @@ -151,11 +156,59 @@ bool nhwc_2_nhwc_uint8(std::shared_ptr bn) { return true; } -bool NC4HW4_2_NC4HW4_float(std::shared_ptr bn) { - MNN_PRINT("\n ========= check NC4HW4_2_NC4HW4_float result ! ========= \n"); +template +bool NC4HW4_2_NC4HW4_IntType(std::shared_ptr bn) { + MNN_PRINT("\n ========= check NC4HW4_2_NC4HW4_IntType result ! 
========= \n"); std::shared_ptr hostTensor( - Tensor::create(std::vector{1, 224, 224, 8}, nullptr, Tensor::CAFFE_C4)); + Tensor::create(std::vector{1, 224, 224, 8}, nullptr, Tensor::CAFFE_C4)); + auto elementSize = hostTensor->elementSize(); + auto hostData = hostTensor->host(); + for (int i = 0; i < elementSize; ++i) { + int flagRandom = i % 255; + hostData[i] = flagRandom; + } + + std::shared_ptr deviceTensor_pre(Tensor::createDevice(std::vector{1, 224, 224, 8}, Tensor::CAFFE_C4)); + bn->onAcquireBuffer(deviceTensor_pre.get(), Backend::STATIC); + std::shared_ptr deviceTensor(Tensor::createDevice(std::vector{1, 224, 224, 8}, Tensor::CAFFE_C4)); + bn->onAcquireBuffer(deviceTensor.get(), Backend::STATIC); + bn->onCopyBuffer(hostTensor.get(), deviceTensor_pre.get()); + bn->onCopyBuffer(deviceTensor_pre.get(), deviceTensor.get()); + + std::shared_ptr checkHostTensor( + Tensor::create(std::vector{1, 224, 224, 8}, nullptr, Tensor::CAFFE_C4)); + bn->onCopyBuffer(deviceTensor.get(), checkHostTensor.get()); + + auto backendCopyData = checkHostTensor->host(); + + for (int i = 0; i < elementSize; ++i) { + if (backendCopyData[i] != hostData[i]) { + MNN_PRINT("Error for NCHW Mid bn:%d, %d -> %d\n", i, hostData[i], backendCopyData[i]); + return false; + } + } + + std::shared_ptr deviceTensor2( + Tensor::createDevice(std::vector{1, 8, 224, 224}, Tensor::TENSORFLOW)); + bn->onAcquireBuffer(deviceTensor2.get(), Backend::DYNAMIC_SEPERATE); + bn->onCopyBuffer(hostTensor.get(), deviceTensor2.get()); + bn->onCopyBuffer(deviceTensor2.get(), checkHostTensor.get()); + for (int i = 0; i < elementSize; ++i) { + if (backendCopyData[i] != hostData[i]) { + MNN_PRINT("Error for NHWC Mid bn:%d, %d -> %d\n", i, hostData[i], backendCopyData[i]); + return false; + } + } + return true; +} + +bool NC4HW4_2_NC4HW4_float(std::shared_ptr bn) { + MNN_PRINT("\n ========= check NC4HW4_2_NC4HW4_float result ! 
========= \n"); + std::vector nhwc_shape = {1, 224, 224, 8}; + std::vector nchw_shape = {1, 8, 224, 224}; + std::shared_ptr hostTensor( + Tensor::create(nhwc_shape, nullptr, Tensor::CAFFE_C4)); auto elementSize = hostTensor->elementSize(); auto hostData = hostTensor->host(); for (int i = 0; i < elementSize; ++i) { @@ -163,17 +216,26 @@ bool NC4HW4_2_NC4HW4_float(std::shared_ptr bn) { hostData[i] = flagRandom; } - std::shared_ptr deviceTensor_pre(Tensor::createDevice(std::vector{1, 224, 224, 8}, Tensor::CAFFE_C4)); + MNN_PRINT("\nalloc deviceTensor_pre\n"); + std::shared_ptr deviceTensor_pre(Tensor::createDevice(nhwc_shape, Tensor::CAFFE_C4)); bn->onAcquireBuffer(deviceTensor_pre.get(), Backend::STATIC); - std::shared_ptr deviceTensor(Tensor::createDevice(std::vector{1, 224, 224, 8}, Tensor::CAFFE_C4)); + + MNN_PRINT("\nalloc deviceTensor"); + std::shared_ptr deviceTensor(Tensor::createDevice(nhwc_shape, Tensor::CAFFE_C4)); bn->onAcquireBuffer(deviceTensor.get(), Backend::STATIC); + + MNN_PRINT("\ncopy from host to deviceTensor_pre\n"); bn->onCopyBuffer(hostTensor.get(), deviceTensor_pre.get()); + + MNN_PRINT("\ncopy from deviceTensor_pre to deviceTensor\n"); bn->onCopyBuffer(deviceTensor_pre.get(), deviceTensor.get()); + MNN_PRINT("\ncopy from deviceTensor to new host\n"); std::shared_ptr checkHostTensor( - Tensor::create(std::vector{1, 224, 224, 8}, nullptr, Tensor::CAFFE_C4)); + Tensor::create(nhwc_shape, nullptr, Tensor::CAFFE_C4)); bn->onCopyBuffer(deviceTensor.get(), checkHostTensor.get()); + auto backendCopyData = checkHostTensor->host(); for (int i = 0; i < elementSize; ++i) { @@ -184,7 +246,7 @@ bool NC4HW4_2_NC4HW4_float(std::shared_ptr bn) { } std::shared_ptr deviceTensor2( - Tensor::createDevice(std::vector{1, 8, 224, 224}, Tensor::TENSORFLOW)); + Tensor::createDevice(nchw_shape, Tensor::TENSORFLOW)); bn->onAcquireBuffer(deviceTensor2.get(), Backend::DYNAMIC_SEPERATE); bn->onCopyBuffer(hostTensor.get(), deviceTensor2.get()); bn->onCopyBuffer(deviceTensor2.get(), checkHostTensor.get()); @@ -249,7 +311,7 @@ void nhwc_2_nhwc_float(std::shared_ptr bn) { auto backendCopyData = checkHostTensor->host(); for (int i = 0; i < elementSize; ++i) { - if (backendCopyData[i] - hostData[i] >= 0.001f) { + if (backendCopyData[i] - hostData[i] >= F32_BF16_MAX_LOSS) { MNN_PRINT("Error for bn:%d, %f -> %f\n", i, hostData[i], backendCopyData[i]); } } @@ -278,7 +340,7 @@ void nchw_2_nchw_float(std::shared_ptr bn) { auto backendCopyData = checkHostTensor->host(); for (int i = 0; i < elementSize; ++i) { - if (abs(backendCopyData[i] - hostData[i]) >= 0.001f) { + if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { MNN_PRINT("Error for bn:%d, %f -> %f\n", i, hostData[i], backendCopyData[i]); } } @@ -319,7 +381,7 @@ void nchw_2_NC4HW4_float(std::shared_ptr bn) { auto backendCopyData = NC4HW4_HostTensor->host(); for (int i = 0; i < elementSize; ++i) { - if (abs(backendCopyData[i] - hostData[i]) >= 0.001f) { + if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { MNN_PRINT("Error for bn:%d, %f -> %f\n", i, hostData[i], backendCopyData[i]); } } @@ -334,7 +396,7 @@ void nchw_2_NC4HW4_float(std::shared_ptr bn) { // MNN_PRINT("NC4HW4 -> nhwc !\n"); for (int i = 0; i < elementSize; ++i) { - if (abs(backendCopyData[i] - hostData[i]) >= 0.001f) { + if (abs(backendCopyData[i] - hostData[i]) >= 0.001) { MNN_PRINT("Error for bn:%d, %f -> %f\n", i, hostData[i], backendCopyData[i]); } } @@ -373,6 +435,69 @@ void nchw_2_NC4HW4_2_nchw_float(std::shared_ptr bn) { } } +template +bool 
nhwc_2_NC4HW4_2_nhwc_inttype(std::shared_ptr bn) { + // Test NHWC -> NC4HW4 -> NHWC + MNN_PRINT("\n ========= check nhwc_2_NC4HW4_2_nhwc_inttype result ! ========= \n"); + int batch = 1; + int channel = 12; + int width = 20; + int height = 20; + std::shared_ptr hostTensor( + Tensor::create(std::vector{batch, channel, height, width}, nullptr, Tensor::CAFFE)); + auto elementSize = hostTensor->elementSize(); + auto hostData = hostTensor->host(); + for (int i = 0; i < elementSize; ++i) { + hostData[i] = rand() % 255; + } + + T* temp = (T*)malloc(hostTensor->size()); + memset(temp, 0.0f, hostTensor->size()); + NCHW2NHWC(hostData, temp, batch, height, width, channel); + + std::shared_ptr deviceTensor_pre(Tensor::createDevice(std::vector{batch, height, width, channel})); + bn->onAcquireBuffer(deviceTensor_pre.get(), Backend::STATIC); + std::shared_ptr deviceTensor(Tensor::createDevice(std::vector{batch, height, width, channel})); + bn->onAcquireBuffer(deviceTensor.get(), Backend::STATIC); + bn->onCopyBuffer(hostTensor.get(), deviceTensor_pre.get()); + bn->onCopyBuffer(deviceTensor_pre.get(), deviceTensor.get()); + + // // nhwc -> NC4HW4 + // MNN_PRINT("nhwc -> NC4HW4 !\n"); + + MNNTensorConvertNHWCToNC4HW4(hostData, temp, height * width, channel); + std::shared_ptr NC4HW4_HostTensor( + Tensor::create(std::vector{batch, channel, height, width}, nullptr, Tensor::CAFFE_C4)); + + bn->onCopyBuffer(deviceTensor.get(), NC4HW4_HostTensor.get()); + auto backendCopyData = NC4HW4_HostTensor->host(); + + for (int i = 0; i < elementSize; ++i) { + if (backendCopyData[i] != hostData[i]) { + MNN_PRINT("Error for bn:%d, %d -> %d\n", i, hostData[i], backendCopyData[i]); + return false; + } + } + + // NC4HW4 -> nhwc + + MNNTensorConvertNC4HW4ToNHWC(temp, hostData, height * width, channel); + + bn->onCopyBuffer(NC4HW4_HostTensor.get(), deviceTensor.get()); + NHWC2NCHW(temp, backendCopyData, batch, height, width, channel); + bn->onCopyBuffer(deviceTensor.get(), hostTensor.get()); + + // MNN_PRINT("NC4HW4 -> nhwc !\n"); + for (int i = 0; i < elementSize; ++i) { + if (backendCopyData[i] != hostData[i]) { + MNN_PRINT("Error for bn:%d, %d -> %d\n", i, hostData[i], backendCopyData[i]); + } + } + + free(temp); + return true; +} + bool nhwc_2_NC4HW4_2_nhwc_float(std::shared_ptr bn) { // Test NHWC -> NC4HW4 -> NHWC MNN_PRINT("\n ========= check nhwc_2_NC4HW4_2_nhwc_float result ! ========= \n"); @@ -412,8 +537,8 @@ bool nhwc_2_NC4HW4_2_nhwc_float(std::shared_ptr bn) { auto backendCopyData = NC4HW4_HostTensor->host(); for (int i = 0; i < elementSize; ++i) { - if (abs(backendCopyData[i] - hostData[i]) >= 0.001f) { - MNN_PRINT("Error for bn:%d, %f -> %f\n", i, hostData[i], backendCopyData[i]); + if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { //Error of converting from float32 to bf16 is more than 0.001 + MNN_PRINT("Error for bn:%d, %f -> %f. F32_BF16_MAX_LOSS:%f\n", i, hostData[i], backendCopyData[i], F32_BF16_MAX_LOSS); return false; } } @@ -428,8 +553,8 @@ bool nhwc_2_NC4HW4_2_nhwc_float(std::shared_ptr bn) { // MNN_PRINT("NC4HW4 -> nhwc !\n"); for (int i = 0; i < elementSize; ++i) { - if (abs(backendCopyData[i] - hostData[i]) >= 0.001f) { - MNN_PRINT("Error for bn:%d, %f -> %f\n", i, hostData[i], backendCopyData[i]); + if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { + MNN_PRINT("Error for bn:%d, %f -> %f. 
F32_BF16_MAX_LOSS:%f\n", i, hostData[i], backendCopyData[i], F32_BF16_MAX_LOSS); } } @@ -454,7 +579,7 @@ public: info.user = &user; std::shared_ptr runtime(creator->onCreate(info)); MNN_PRINT("Test %d Backend for %d \n", type, user.precision); - std::shared_ptr bn(runtime->onCreate()); + std::shared_ptr bn(runtime->onCreate(&user)); auto res = NC4HW4_2_NC4HW4_float(bn); res = res && nhwc_2_NC4HW4_2_nhwc_float(bn); if (!res) { @@ -467,6 +592,35 @@ public: } }; +class CPUBackendCopyBufferTest : public MNNTestCase { +public: + virtual bool run() { + auto type = MNN_FORWARD_CPU; + auto creator = MNNGetExtraRuntimeCreator(type); + for (int p = 0; p < 3; ++p) { + MNN::Backend::Info info; + info.type = type; + BackendConfig user; + user.precision = (MNN::BackendConfig::PrecisionMode)p; + info.user = &user; + std::shared_ptr runtime(creator->onCreate(info)); + MNN_PRINT("Test %d Backend for %d \n", type, user.precision); + std::shared_ptr bn(runtime->onCreate(&user)); + auto res = NC4HW4_2_NC4HW4_IntType(bn); + res = res && NC4HW4_2_NC4HW4_IntType(bn); + res = res && NC4HW4_2_NC4HW4_IntType(bn); + res = res && nhwc_2_NC4HW4_2_nhwc_inttype(bn); + res = res && nhwc_2_NC4HW4_2_nhwc_inttype(bn); + res = res && nhwc_2_NC4HW4_2_nhwc_inttype(bn); + if (!res) { + MNN_ERROR("Error for Int Copy\n"); + return false; + } + } + return true; + } +}; + class BackendCopyBufferUint8Test : public MNNTestCase { public: virtual bool run() { @@ -498,3 +652,4 @@ public: }; MNNTestSuiteRegister(BackendCopyBufferFloatTest, "engine/backend/copy_buffer_float"); //MNNTestSuiteRegister(BackendCopyBufferUint8Test, "engine/backend/copy_buffer_uint8"); +MNNTestSuiteRegister(CPUBackendCopyBufferTest, "engine/backend/copy_buffer_cpu"); diff --git a/test/core/DirectedAcyclicGraphTest.cpp b/test/core/DirectedAcyclicGraphTest.cpp deleted file mode 100644 index e00cdf1c..00000000 --- a/test/core/DirectedAcyclicGraphTest.cpp +++ /dev/null @@ -1,508 +0,0 @@ -// -// DirectedAcyclicGraphTest.cpp -// MNNTests -// -// Created by MNN on 2019/01/30. 
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#include -#include -#include -#include "MNNTestSuite.h" -#include "core/DirectedAcyclicGraph.hpp" - -using namespace MNN; - -class OPCustom { -public: - OPCustom(string n) { - name = n; - }; - virtual ~OPCustom(){ - // MNN_PRINT("OPCustom free\n"); - }; - -public: - void setName(string n) { - name = n; - } - string getName() { - return name; - } - -private: - string name; -}; - -class OPCustomNodeDef : public NodeDef> { -public: - OPCustomNodeDef(string name) { - this->name = name; - } - -public: - void setName(string n) { - this->name = n; - } - -public: - virtual shared_ptr>> makeNode() override { - shared_ptr>> ptr = make_shared>>(); - shared_ptr op = make_shared(name); - ptr->setData(op); - return ptr; - } - -private: - string name; -}; - -static int stringCounter(const string& str, const string& sub) { - int num = 0; - for (size_t i = 0; (i = str.find(sub, i)) != string::npos; num++, i++) { - // do nothing - } - return num; -} - -static bool endsWith(const std::string& str, const std::string& suffix) { - return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); -} - -static bool startsWith(const std::string& str, const std::string& prefix) { - return str.size() >= prefix.size() && 0 == str.compare(0, prefix.size(), prefix); -} - -/* * - * input A->B->C->D expect output A->B->C->D return true - * smart pointer use_count == 2 - * */ -static void TestMemoryLeak() { - OPCustomNodeDef def("A"); - unique_ptr>> graph(new DirectedAcyclicGraph>()); - shared_ptr>> A = graph->AddNode(def); - def.setName("B"); - shared_ptr>> B = graph->AddNode(def); - def.setName("C"); - shared_ptr>> C = graph->AddNode(def); - def.setName("D"); - shared_ptr>> D = graph->AddNode(def); - graph->AddEdge(A, B); - graph->AddEdge(B, C); - graph->AddEdge(C, D); - vector>>> order; - bool ok = graph->GetPostOrder(order); - graph.reset(); - A.reset(); - B.reset(); - C.reset(); - D.reset(); - - stringstream ss; - ss.str(""); - ss.clear(); - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->" << op.use_count() << "\t"; - } - - const string rel_str(ss.str()); - const string exp_str = "A->2\tB->2\tC->2\tD->2\t"; - const int exp_val = exp_str.compare(rel_str); - if ((exp_val != 0) || (!ok)) { - MNN_ERROR("TestMemoryLeak expect '%s,ok=1' output is %s,ok=%d\n", exp_str.c_str(), rel_str.c_str(), ok); - } -} - -/* * - * input A C->B D expect output A->C->B->D or A->D->C->B or D->A->C->B or C->B->A->D or C->B->D->A return true - * input A C->B D->B expect output A->C->D->B or C->D->B->A return true - * input C->B D->B C->A expect output C->A->D->B or D->C->A->B or D->C->B->A return true - * input C->B D->B C->A D->C expect output D->C->A->B or D->C->B->A return true - * input C->B D->B C->A D->C A->C expect return false - * */ -static void TestPostOrderSinglePoint() { - OPCustomNodeDef def("A"); - unique_ptr>> graph(new DirectedAcyclicGraph>()); - shared_ptr>> A = graph->AddNode(def); - def.setName("B"); - shared_ptr>> B = graph->AddNode(def); - def.setName("C"); - shared_ptr>> C = graph->AddNode(def); - def.setName("D"); - shared_ptr>> D = graph->AddNode(def); - graph->AddEdge(C, B); - vector>>> order; - bool ok = graph->GetPostOrder(order); - stringstream ss; - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->"; - } - - string rel_str(ss.str()); - string exp_str = "A->C->B->D->"; - string exp_str2 = "A->D->C->B->"; - string exp_str3 = 
"D->A->C->B->"; - string exp_str4 = "C->B->D->A->"; - string exp_str5 = "C->B->A->D->"; - int exp_val = exp_str.compare(rel_str); - if (0 != exp_val) { - exp_val = exp_str2.compare(rel_str); - } - if (0 != exp_val) { - exp_val = exp_str3.compare(rel_str); - } - if (0 != exp_val) { - exp_val = exp_str4.compare(rel_str); - } - if (0 != exp_val) { - exp_val = exp_str5.compare(rel_str); - } - if ((!ok) || (0 != exp_val)) { - MNN_ERROR("TestPostOrderSinglePoint expect 'A->C->B->D,ok=1' output is %s,ok=%d\n", rel_str.c_str(), ok); - } - - graph->AddEdge(D, B); - ok = graph->GetPostOrder(order); - ss.str(""); - ss.clear(); - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->"; - } - - rel_str = ss.str(); - exp_str = "A->D->C->B->"; - exp_str2 = "A->C->D->B->"; - exp_str3 = "C->D->B->A->"; - exp_val = exp_str.compare(rel_str); - if (0 != exp_val) { - exp_val = exp_str2.compare(rel_str); - } - if (0 != exp_val) { - exp_val = exp_str3.compare(rel_str); - } - if ((!ok) || (0 != exp_val)) { - MNN_ERROR("TestPostOrderSinglePoint expect 'A->C->D->B or A->D->C->B,ok=1' output is %s,ok=%d\n", - rel_str.c_str(), ok); - } - - graph->AddEdge(C, A); - ok = graph->GetPostOrder(order); - ss.str(""); - ss.clear(); - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->"; - } - - rel_str = ss.str(); - exp_str = "C->A->D->B->"; - exp_str2 = "D->C->A->B->"; - exp_str3 = "D->C->B->A->"; - exp_val = exp_str.compare(rel_str); - if (0 != exp_val) { - exp_val = exp_str2.compare(rel_str); - } - if (0 != exp_val) { - exp_val = exp_str3.compare(rel_str); - } - if ((!ok) || (0 != exp_val)) { - MNN_ERROR("TestPostOrderSinglePoint expect 'C->A->D->B,ok=1' output is %s,ok=%d\n", rel_str.c_str(), ok); - } - - graph->AddEdge(D, C); - ok = graph->GetPostOrder(order); - ss.str(""); - ss.clear(); - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->"; - } - - rel_str = ss.str(); - exp_str = "D->C->A->B->"; - exp_str2 = "D->C->B->A->"; - exp_val = exp_str.compare(rel_str); - if (0 != exp_val) { - exp_val = exp_str2.compare(rel_str); - } - if ((!ok) || (0 != exp_val)) { - MNN_ERROR("TestPostOrderSinglePoint expect 'D->C->A->B or D->C->B->A,ok=1' output is %s,ok=%d\n", - rel_str.c_str(), ok); - } - - /*cycle*/ - graph->AddEdge(A, C); - - ok = graph->GetPostOrder(order); - ss.str(""); - ss.clear(); - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->"; - } - if (false != ok) { - MNN_ERROR("TestPostOrderSinglePoint cycle expect 'ok=0' output is %s,ok=%d\n", ss.str().c_str(), ok); - } -} - -/* * - * input A->B->C->D expect output A->B->C->D return true - * input A->B->C->D->A expect return false - * */ -static void TestPostOrder() { - OPCustomNodeDef def("A"); - unique_ptr>> graph(new DirectedAcyclicGraph>()); - shared_ptr>> A = graph->AddNode(def); - def.setName("B"); - shared_ptr>> B = graph->AddNode(def); - def.setName("C"); - shared_ptr>> C = graph->AddNode(def); - def.setName("D"); - shared_ptr>> D = graph->AddNode(def); - graph->AddEdge(A, B); - graph->AddEdge(B, C); - graph->AddEdge(C, D); - vector>>> order; - bool ok = graph->GetPostOrder(order); - stringstream ss; - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->"; - } - - const string rel_str(ss.str()); - const string exp_str = "A->B->C->D->"; - const int exp_val = exp_str.compare(rel_str); - if ((!ok) || (0 != exp_val)) { - MNN_ERROR("TestPostOrder expect 
'A->B->C->D,ok=1' output is %s,ok=%d\n", rel_str.c_str(), ok); - } - - /*cycle*/ - graph->AddEdge(D, B); - ok = graph->GetPostOrder(order); - ss.str(""); - ss.clear(); - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->"; - } - if (false != ok) { - MNN_ERROR("TestPostOrder cycle expect 'ok=0' output is %s,ok=%d\n", ss.str().c_str(), ok); - } -} - -/* * - * input A->B->C->D expect output A->B->C->D return true - * input A->B->C->D->A expect return false - * */ -static void TestPostOrderDiffInputs() { - OPCustomNodeDef def("A"); - unique_ptr>> graph(new DirectedAcyclicGraph>()); - shared_ptr>> A = graph->AddNode(def); - def.setName("C"); - shared_ptr>> C = graph->AddNode(def); - def.setName("B"); - shared_ptr>> B = graph->AddNode(def); - def.setName("D"); - shared_ptr>> D = graph->AddNode(def); - graph->AddEdge(C, D); - graph->AddEdge(B, C); - graph->AddEdge(A, B); - vector>>> order; - bool ok = graph->GetPostOrder(order); - stringstream ss; - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->"; - } - const string rel_str(ss.str()); - const string exp_str = "A->B->C->D->"; - const int exp_val = exp_str.compare(rel_str); - if ((!ok) || (0 != exp_val)) { - MNN_ERROR("TestPostOrderDiffInputs expect 'A->B->C->D,ok=1' output is %s,ok=%d\n", rel_str.c_str(), ok); - } - - /*cycle*/ - graph->AddEdge(D, B); - ok = graph->GetPostOrder(order); - ss.str(""); - ss.clear(); - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->"; - } - if (false != ok) { - MNN_ERROR("TestPostOrderDiffInputs cycle expect 'ok=0' output is %s,ok=%d\n", ss.str().c_str(), ok); - } -} - -/* * - * input A B C D expect return true do'nt care order,only contain A B C D - * */ -static void TestPostOrderAllSingle() { - OPCustomNodeDef def("A"); - unique_ptr>> graph(new DirectedAcyclicGraph>()); - shared_ptr>> A = graph->AddNode(def); - def.setName("B"); - shared_ptr>> B = graph->AddNode(def); - def.setName("C"); - shared_ptr>> C = graph->AddNode(def); - def.setName("D"); - shared_ptr>> D = graph->AddNode(def); - vector>>> order; - bool ok = graph->GetPostOrder(order); - stringstream ss; - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->"; - } - - const string rel_str(ss.str()); - const string exp_str1 = "A->"; - const string exp_str2 = "B->"; - const string exp_str3 = "C->"; - const string exp_str4 = "D->"; - const int exp_val1 = stringCounter(rel_str, exp_str1); - const int exp_val2 = stringCounter(rel_str, exp_str2); - const int exp_val3 = stringCounter(rel_str, exp_str3); - const int exp_val4 = stringCounter(rel_str, exp_str4); - const int exp_len = (int)(exp_str1.length() + exp_str2.length() + exp_str3.length() + exp_str4.length()); - if ((exp_val1 != 1) || (exp_val2 != 1) || (exp_val3 != 1) || (exp_val4 != 1) || (!ok) || - (rel_str.length() != exp_len)) { - MNN_ERROR("TestPostOrderAllSingle expect only contain 'A B C D,ok=1' ignore order output is %s,ok=%d\n", - rel_str.c_str(), ok); - } -} - -/* * - * input A->B A->C A->D expect return true and A is first - * */ -static void TestPostOrderAllFromOne() { - OPCustomNodeDef def("A"); - unique_ptr>> graph(new DirectedAcyclicGraph>()); - shared_ptr>> A = graph->AddNode(def); - def.setName("B"); - shared_ptr>> B = graph->AddNode(def); - def.setName("C"); - shared_ptr>> C = graph->AddNode(def); - def.setName("D"); - shared_ptr>> D = graph->AddNode(def); - - graph->AddEdge(A, D); - graph->AddEdge(A, C); - 
graph->AddEdge(A, B); - - vector>>> order; - bool ok = graph->GetPostOrder(order); - stringstream ss; - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->"; - } - - const string rel_str(ss.str()); - const string exp_str1 = "A->"; - const string exp_str2 = "B->"; - const string exp_str3 = "C->"; - const string exp_str4 = "D->"; - const int exp_val1 = stringCounter(rel_str, exp_str1); - const int exp_val2 = stringCounter(rel_str, exp_str2); - const int exp_val3 = stringCounter(rel_str, exp_str3); - const int exp_val4 = stringCounter(rel_str, exp_str4); - const int exp_len = (int)(exp_str1.length() + exp_str2.length() + exp_str3.length() + exp_str4.length()); - const bool exp_val = startsWith(rel_str, exp_str1); - - if ((exp_val1 != 1) || (exp_val2 != 1) || (exp_val3 != 1) || (exp_val4 != 1) || (!ok) || - (rel_str.length() != exp_len) || (!exp_val)) { - MNN_ERROR("TestPostOrderAllFromOne expect A is first output is %s,ok=%d\n", rel_str.c_str(), ok); - } -} - -/* * - * input B->A C->A D->A expect return true and A is last - * */ -static void TestPostOrderAllToOne() { - OPCustomNodeDef def("A"); - unique_ptr>> graph(new DirectedAcyclicGraph>()); - shared_ptr>> A = graph->AddNode(def); - def.setName("B"); - shared_ptr>> B = graph->AddNode(def); - def.setName("C"); - shared_ptr>> C = graph->AddNode(def); - def.setName("D"); - shared_ptr>> D = graph->AddNode(def); - - graph->AddEdge(D, A); - graph->AddEdge(C, A); - graph->AddEdge(B, A); - - vector>>> order; - bool ok = graph->GetPostOrder(order); - stringstream ss; - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->"; - } - - const string rel_str(ss.str()); - const string exp_str1 = "A->"; - const string exp_str2 = "B->"; - const string exp_str3 = "C->"; - const string exp_str4 = "D->"; - const int exp_val1 = stringCounter(rel_str, exp_str1); - const int exp_val2 = stringCounter(rel_str, exp_str2); - const int exp_val3 = stringCounter(rel_str, exp_str3); - const int exp_val4 = stringCounter(rel_str, exp_str4); - const int exp_len = (int)(exp_str1.length() + exp_str2.length() + exp_str3.length() + exp_str4.length()); - const bool exp_val = endsWith(rel_str, exp_str1); - if ((exp_val1 != 1) || (exp_val2 != 1) || (exp_val3 != 1) || (exp_val4 != 1) || (!ok) || - (rel_str.length() != exp_len) || (!exp_val)) { - MNN_ERROR("TestPostOrderAllToOne expect A is last output is %s,ok=%d\n", rel_str.c_str(), ok); - } -} - -/* * - * expect return true - * */ -static void TestPostOrderEmpty() { - unique_ptr>> graph(new DirectedAcyclicGraph>()); - vector>>> order; - bool ok = graph->GetPostOrder(order); - stringstream ss; - for (shared_ptr>> op : order) { - string name = op->getData()->getName(); - ss << name << "->"; - } - - const string rel_str(ss.str()); - if ((!ok) || (rel_str.length() != 0)) { - MNN_ERROR("TestPostOrderEmpty expect 'ok=1',%s output is ok=%d\n", rel_str.c_str(), ok); - } -} - -class DirectedAcyclicGraphTest : public MNNTestCase { -public: - virtual bool run(); - DirectedAcyclicGraphTest() { - } - virtual ~DirectedAcyclicGraphTest() { - } -}; - -bool DirectedAcyclicGraphTest::run() { - TestPostOrder(); - TestPostOrderSinglePoint(); - TestMemoryLeak(); - TestPostOrderDiffInputs(); - TestPostOrderAllSingle(); - TestPostOrderAllFromOne(); - TestPostOrderAllToOne(); - TestPostOrderEmpty(); - return true; -} - -MNNTestSuiteRegister(DirectedAcyclicGraphTest, "engine/DirectedAcyclicGraph"); diff --git a/test/core/RegionFuse.cpp b/test/core/RegionFuse.cpp index 
fb015d54..ba57fbb2 100644 --- a/test/core/RegionFuse.cpp +++ b/test/core/RegionFuse.cpp @@ -17,7 +17,7 @@ public: using Region = Tensor::InsideDescribe::Region; virtual ~RegionFuseTest() = default; virtual bool run() { - constexpr int N = 10; + constexpr int N = 11; // [src_offset, src_stride_0_1_2, dst_offset, dst_stride_0_1_2, size_0_1_2] int data[N*3][11] = { // 2D-transpose + 2D-transpose = memcpy: [1, 4, 16] => [1, 16, 4] => [1, 4, 16] @@ -59,6 +59,10 @@ public: // transpose + slice (dont align, not full copy) {0, 1600, 1, 4, 0, 1600, 400, 1, 53, 4, 400}, {0, 400, 20, 1, 0, 400, 20, 1, 190, 20, 20}, + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + // pad + transpose + slice + transpose (not full copy) + {0, 12321, 111, 1, 0, 12544, 112, 1, 32, 111, 111}, + {113, 12544, 112, 1, 0, 12321, 111, 1, 32, 111, 111}, {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1} }; for (int i = 0; i < N; i++) { @@ -71,6 +75,7 @@ public: } int cmp = ::memcmp(&dst, data[3 * i + 2], 44); if (!fused || (cmp != 0)) { + MNN_ERROR("regionfuse %d test failed!\n", i); return false; } } diff --git a/test/expr/MemoryIncrease.cpp b/test/expr/MemoryIncrease.cpp index d9210d93..595805f4 100644 --- a/test/expr/MemoryIncrease.cpp +++ b/test/expr/MemoryIncrease.cpp @@ -101,8 +101,27 @@ public: virtual bool run() { auto x = _Input({1, 3, 224, 224}, NCHW, halide_type_of()); auto y = _Interp({x}, 0.25, 0.25, 56, 56, 2, true); + y = _Convert(y, NCHW); + auto size = y->getInfo()->size; + int e = 14; + y = _Reshape(y, {e, -1}); + auto l = size / e; + VARP res; + { + std::unique_ptr mat(new OpT); + mat->type = OpType_MatMul; + mat->main.type = OpParameter_MatMul; + mat->main.value = new MatMulT; + mat->main.AsMatMul()->transposeA = false; + mat->main.AsMatMul()->transposeB = false; + + std::vector bias(e, 0.0f); + auto biasVar = _Const(bias.data(), {e}, NCHW, halide_type_of()); + auto weightVar = _Input({l, 50}, NCHW, halide_type_of()); + res = Variable::create(Expr::create(mat.get(), {y, weightVar, biasVar})); + } std::unique_ptr net(new NetT); - Variable::save({y}, net.get()); + Variable::save({res}, net.get()); flatbuffers::FlatBufferBuilder builderOutput(1024); auto len = MNN::Net::Pack(builderOutput, net.get()); builderOutput.Finish(len); diff --git a/test/main.cpp b/test/main.cpp index b1d506ef..71247ce6 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -15,6 +15,12 @@ #include "MNNTestSuite.h" int main(int argc, char* argv[]) { + if (argc == 2 && strcmp(argv[1], "--help") == 0) { + MNN_PRINT("./run_test.out [test_name] [backend] [precision]\n"); + MNN_PRINT("\t backend: 0 - CPU (default), 3 - OpenCL\n"); + MNN_PRINT("\t precision: 0 - Normal, 1 - High (default), 2 - Low\n"); + return 0; + } if (argc > 2) { auto type = (MNNForwardType)atoi(argv[2]); FUNC_PRINT(type); diff --git a/test/model/MobileNetTest.cpp b/test/model/MobileNetTest.cpp index 647236fc..bd093d4e 100644 --- a/test/model/MobileNetTest.cpp +++ b/test/model/MobileNetTest.cpp @@ -177,3 +177,45 @@ MNNTestSuiteRegister(MobileNetV1Test, "model/mobilenet/1/caffe"); MNNTestSuiteRegister(MobileNetV2Test, "model/mobilenet/2/caffe"); MNNTestSuiteRegister(MobileNetV2TFLiteTest, "model/mobilenet/2/tflite"); MNNTestSuiteRegister(MobileNetV2TFLiteQntTest, "model/mobilenet/2/tflite_qnt"); + + +class ModelTest : public MNNTestCase { +public: + virtual ~ModelTest() = default; + + std::string root() { +#ifdef __APPLE__ + auto bundle = CFBundleGetMainBundle(); + auto url = CFBundleCopyBundleURL(bundle); + auto string = CFURLCopyFileSystemPath(url, kCFURLPOSIXPathStyle); + 
CFRelease(url); + auto cstring = CFStringGetCStringPtr(string, kCFStringEncodingUTF8); + auto res = std::string(cstring); + CFRelease(string); + return res; +#else + return "../resource"; // assume run in build dir +#endif + } + + std::string path() { + return this->root() + "/model/temp.bin"; + } + + virtual bool run() { + auto net = MNN::Interpreter::createFromFile(this->path().c_str()); + if (NULL == net) { + return false; + } + ScheduleConfig cpuconfig; + cpuconfig.type = MNN_FORWARD_CPU; + BackendConfig bnConfig; + bnConfig.precision = BackendConfig::Precision_Low; + cpuconfig.backendConfig = &bnConfig; + auto session = net->createSession(cpuconfig); + net->runSession(session); + delete net; + return true; + } +}; +MNNTestSuiteRegister(ModelTest, "model/model_test"); diff --git a/test/op/BinaryOPTest.cpp b/test/op/BinaryOPTest.cpp index 3761907e..5496980a 100644 --- a/test/op/BinaryOPTest.cpp +++ b/test/op/BinaryOPTest.cpp @@ -12,702 +12,286 @@ #include "TestUtils.h" using namespace MNN::Express; +using namespace std; -class BinaryBroadcastShapeTest : public MNNTestCase { -public: - virtual ~BinaryBroadcastShapeTest() = default; - virtual bool run() { - auto input_x = _Const(1, {4, 1, 2, 1}, NCHW); - auto input_y = _Const(1, {2, 1, 4}, NCHW); +class BinaryTestCommon : public MNNTestCase { +protected: + template + bool test(VARP (*opFunc)(VARP, VARP), string name, Tout threshold, + const vector& data_x, const vector& data_y, const vector& data_out, + const vector& shape_x, const vector& shape_y, const vector& shape_out) { + int size_x = 1, size_y = 1, size_out = 1; + for (int i = 0; i < shape_x.size(); ++i) { + size_x *= shape_x[i]; + } + for (int i = 0; i < shape_y.size(); ++i) { + size_y *= shape_y[i]; + } + for (int i = 0; i < shape_y.size(); ++i) { + size_out *= shape_out[i]; + } + + auto input_x = _Input(shape_x, NCHW, halide_type_of()); + auto input_y = _Input(shape_y, NCHW, halide_type_of()); input_x->setName("input_x"); input_y->setName("input_y"); - auto output = _Add(input_x, input_y); - const std::vector expectedOutputShape = {4, 2, 2, 4}; - auto outputSize = output->getInfo()->dim.size(); - if (outputSize != expectedOutputShape.size()) { - MNN_ERROR("BinaryBroadcastShapeTest shape compute error!\n"); + // set input data + auto ptr_x = input_x->template writeMap(); + auto ptr_y = input_y->template writeMap(); + memcpy(ptr_x, data_x.data(), size_x * sizeof(Tin)); + memcpy(ptr_y, data_y.data(), size_y * sizeof(Tin)); + input_x->unMap(); + input_y->unMap(); + auto output = opFunc(input_x, input_y); + auto gotOutput = output->template readMap(); + + auto shape_got = output->getInfo()->dim; + if (shape_got.size() != shape_out.size()) { + MNN_ERROR("%s shape compute error!\n", name.c_str()); return false; } - for (int i = 0; i < outputSize; i++) { - if (output->getInfo()->dim[i] != expectedOutputShape[i]) { - MNN_ERROR("BinaryBroadcastShapeTest shape compute error!\n"); + for (int i = 0; i < shape_got.size(); i++) { + if (shape_got[i] != shape_out[i]) { + MNN_ERROR("%s shape compute error!\n", name.c_str()); return false; } } - const std::vector expectedOutput = {2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., - 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., - 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., - 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.}; - auto outputPtr = output->readMap(); - if (!checkVector(outputPtr, expectedOutput.data(), outputSize, 1e-6)) { - MNN_ERROR("BinaryBroadcastShapeTest compute 
error!\n"); + + if (!checkVector(gotOutput, data_out.data(), size_out, threshold)) { + MNN_ERROR("%s test failed!\n", name.c_str()); return false; } return true; } }; -class AddTest : public MNNTestCase { +class AddTest : public BinaryTestCommon { public: virtual ~AddTest() = default; virtual bool run() { - auto input_x = _Input( - { - 4, - }, - NCHW); - auto input_y = _Input( - { - 4, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {-1.0, -2.0, -3.0, -4.0}; - const float data_y[] = {1.0, 2.0, 3.0, 4.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 4 * sizeof(float)); - memcpy(ptr_y, data_y, 4 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _Add(input_x, input_y); - const std::vector expectedOutput = {0.0, 0.0, 0.0, 0.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("AddTest test failed!\n"); - return false; - } - return true; - } -}; -class SubtractTest : public MNNTestCase { -public: - virtual ~SubtractTest() = default; - virtual bool run() { - auto input_x = _Input( - { - 4, - }, - NCHW); - auto input_y = _Input( - { - 4, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {-1.0, -2.0, -3.0, -4.0}; - const float data_y[] = {1.0, 2.0, 3.0, 4.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 4 * sizeof(float)); - memcpy(ptr_y, data_y, 4 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _Subtract(input_x, input_y); - const std::vector expectedOutput = {-2.0, -4.0, -6.0, -8.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("SubtractTest test failed!\n"); - return false; - } - return true; - } -}; -class MultiplyTest : public MNNTestCase { -public: - virtual ~MultiplyTest() = default; - virtual bool run() { - auto input_x = _Input( - { - 4, - }, - NCHW); - auto input_y = _Input( - { - 4, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {-1.0, -2.0, -3.0, -4.0}; - const float data_y[] = {1.0, 2.0, 3.0, 4.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 4 * sizeof(float)); - memcpy(ptr_y, data_y, 4 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _Multiply(input_x, input_y); - const std::vector expectedOutput = {-1.0, -4.0, -9.0, -16.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("MultiplyTest test failed!\n"); - return false; - } - return true; - } -}; -class DivideTest : public MNNTestCase { -public: - virtual ~DivideTest() = default; - virtual bool run() { - auto input_x = _Input( - { - 4, - }, - NCHW); - auto input_y = _Input( - { - 4, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {-1.0, -2.0, -3.0, -4.0}; - const float data_y[] = {2.0, 4.0, 6.0, 8.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 4 * sizeof(float)); - memcpy(ptr_y, data_y, 4 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _Divide(input_x, input_y); - const std::vector expectedOutput = {-0.5, -0.5, -0.5, -0.5}; - auto gotOutput = 
output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("DivideTest test failed!\n"); - return false; - } - return true; - } -}; -class PowTest : public MNNTestCase { -public: - virtual ~PowTest() = default; - virtual bool run() { - auto input_x = _Input( - { - 4, - }, - NCHW); - auto input_y = _Input( - { - 4, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {-1.0, -2.0, -3.0, -4.0}; - const float data_y[] = {2.0, 4.0, 6.0, 4.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 4 * sizeof(float)); - memcpy(ptr_y, data_y, 4 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _Pow(input_x, input_y); - const std::vector expectedOutput = {1.0, 16.0, 729.0, 256.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("PowTest test failed!\n"); - return false; - } - return true; - } -}; -class MinimumTest : public MNNTestCase { -public: - virtual ~MinimumTest() = default; - virtual bool run() { - auto input_x = _Input( - { - 4, - }, - NCHW); - auto input_y = _Input( - { - 4, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {-1.0, -2.0, -3.0, -4.0}; - const float data_y[] = {2.0, 4.0, 6.0, 8.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 4 * sizeof(float)); - memcpy(ptr_y, data_y, 4 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _Minimum(input_x, input_y); - const std::vector expectedOutput = {-1.0, -2.0, -3.0, -4.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("MinimumTest test failed!\n"); - return false; - } - return true; - } -}; -class MaximumTest : public MNNTestCase { -public: - virtual ~MaximumTest() = default; - virtual bool run() { - auto input_x = _Input( - { - 4, - }, - NCHW); - auto input_y = _Input( - { - 4, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {-1.0, -2.0, -3.0, -4.0}; - const float data_y[] = {2.0, 4.0, 6.0, 8.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 4 * sizeof(float)); - memcpy(ptr_y, data_y, 4 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _Maximum(input_x, input_y); - const std::vector expectedOutput = {2.0, 4.0, 6.0, 8.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("MaximumTest test failed!\n"); - return false; - } - return true; - } -}; -class BiasAddTest : public MNNTestCase { -public: - virtual ~BiasAddTest() = default; - virtual bool run() { - auto input_x = _Input({4, 2}, NCHW); - auto input_y = _Input( - { - 2, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0}; - const float data_y[] = {1.0, 2.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 8 * sizeof(float)); - memcpy(ptr_y, data_y, 2 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _BiasAdd(input_x, input_y); - const std::vector expectedOutput = {0.0, 0.0, -2.0, -2.0, -4.0, -4.0, -6.0, -6.0}; - auto gotOutput = output->readMap(); 
- if (!checkVector(gotOutput, expectedOutput.data(), 8, 0.01)) { - MNN_ERROR("BiasAddTest test failed!\n"); - return false; - } - return true; - } -}; -class GreaterTest : public MNNTestCase { -public: - virtual ~GreaterTest() = default; - virtual bool run() { - auto input_x = _Input({4, 2}, NCHW); - auto input_y = _Input( - { - 2, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; - const float data_y[] = {3.0, 4.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 8 * sizeof(float)); - memcpy(ptr_y, data_y, 2 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _Greater(input_x, input_y); - const std::vector expectedOutput = {0, 0, 0, 0, 1, 1, 1, 1}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 8, 0)) { - MNN_ERROR("GreaterTest test failed!\n"); - return false; - } - return true; - } -}; -class GreaterEqualTest : public MNNTestCase { -public: - virtual ~GreaterEqualTest() = default; - virtual bool run() { - auto input_x = _Input({4, 2}, NCHW); - auto input_y = _Input( - { - 2, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; - const float data_y[] = {3.0, 4.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 8 * sizeof(float)); - memcpy(ptr_y, data_y, 2 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _GreaterEqual(input_x, input_y); - const std::vector expectedOutput = {0, 0, 1, 1, 1, 1, 1, 1}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 8, 0)) { - MNN_ERROR("GreaterEqualTest test failed!\n"); - return false; - } - return true; - } -}; -class LessTest : public MNNTestCase { -public: - virtual ~LessTest() = default; - virtual bool run() { - auto input_x = _Input({4, 2}, NCHW); - auto input_y = _Input( - { - 2, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; - const float data_y[] = {3.0, 4.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 8 * sizeof(float)); - memcpy(ptr_y, data_y, 2 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _Less(input_x, input_y); - const std::vector expectedOutput = {1, 1, 0, 0, 0, 0, 0, 0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 8, 0)) { - MNN_ERROR("LessTest test failed!\n"); - return false; - } - return true; - } -}; -class FloorDivTest : public MNNTestCase { -public: - virtual ~FloorDivTest() = default; - virtual bool run() { - auto input_x = _Input({4, 2}, NCHW); - auto input_y = _Input( - { - 2, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {-1.0, -2.0, -3.0, -4.0, 5.0, 6.0, 7.0, 8.001}; - const float data_y[] = {3.0, 4.0}; - auto ptr_x = input_x->writeMap(); - memcpy(ptr_x, data_x, 8 * sizeof(float)); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_y, data_y, 2 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _FloorDiv(input_x, input_y); - const std::vector expectedOutput = {-1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 2.0, 2.0}; - auto gotOutput = output->readMap(); - if 
(!checkVector(gotOutput, expectedOutput.data(), 8, 0.01)) { - MNN_ERROR("FloorDivTest test failed!\n"); - for (int i = 0; i < expectedOutput.size(); ++i) { - printf("%f - %f\n", expectedOutput[i], gotOutput[i]); - } - return false; - } - return true; - } -}; -class SquaredDifferenceTest : public MNNTestCase { -public: - virtual ~SquaredDifferenceTest() = default; - virtual bool run() { - auto input_x = _Input({4, 2}, NCHW); - auto input_y = _Input( - { - 2, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {-1.0, -2.0, -3.0, -4.0, 5.0, 6.0, 7.0, 8.0}; - const float data_y[] = {3.0, 4.0}; - auto ptr_x = input_x->writeMap(); - memcpy(ptr_x, data_x, 8 * sizeof(float)); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_y, data_y, 2 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _SquaredDifference(input_x, input_y); - const std::vector expectedOutput = {16.0, 36.0, 36.0, 64.0, 4.0, 4.0, 16.0, 16.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 8, 0.01)) { - MNN_ERROR("SquaredDifferenceTest test failed!\n"); - return false; - } - return true; - } -}; -class EqualTest : public MNNTestCase { -public: - virtual ~EqualTest() = default; - virtual bool run() { - auto input_x = _Input({4, 2}, NCHW); - auto input_y = _Input( - { - 2, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; - const float data_y[] = {3.0, 4.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 8 * sizeof(float)); - memcpy(ptr_y, data_y, 2 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _Equal(input_x, input_y); - const std::vector expectedOutput = {0, 0, 1, 1, 0, 0, 0, 0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 8, 0)) { - MNN_ERROR("EqualTest test failed!\n"); - return false; - } - return true; - } -}; -class LessEqualTest : public MNNTestCase { -public: - virtual ~LessEqualTest() = default; - virtual bool run() { - auto input_x = _Input({4, 2}, NCHW); - auto input_y = _Input( - { - 2, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; - const float data_y[] = {3.0, 4.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 8 * sizeof(float)); - memcpy(ptr_y, data_y, 2 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _LessEqual(input_x, input_y); - const std::vector expectedOutput = {1, 1, 1, 1, 0, 0, 0, 0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 8, 0)) { - MNN_ERROR("LessEqualTest test failed!\n"); - return false; - } - return true; - } -}; -class FloorModTest : public MNNTestCase { -public: - virtual ~FloorModTest() = default; - virtual bool run() { - auto input_x = _Input({4, 2}, NCHW); - auto input_y = _Input( - { - 2, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {-1.0f, -2.0f, -3.0f, -4.0f, 5.0f, 6.0f, 7.0f, 8.00001f}; - const float data_y[] = {3.0f, 4.0f}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 8 * sizeof(float)); - memcpy(ptr_y, data_y, 2 * sizeof(float)); - input_x->unMap(); - 
input_y->unMap(); - auto output = _FloorMod(input_x, input_y); - const std::vector expectedOutput = {2.0f, 2.0f, 0.0f, 0.0f, 2.0f, 2.0f, 1.0f, 0.0f}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 8, 0.01)) { - MNN_ERROR("FloorMod test failed!\n"); - for (int i = 0; i < expectedOutput.size(); ++i) { - printf("%f - %f\n", expectedOutput[i], gotOutput[i]); - } - return false; - } - return true; - } -}; -class Atan2Test : public MNNTestCase { -public: - virtual ~Atan2Test() = default; - virtual bool run() { - auto input_x = _Input({4, 2}, NCHW); - auto input_y = _Input( - { - 2, - }, - NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const float data_x[] = {-1.0, -2.0, -3.0, -4.0, 5.0, 6.0, 7.0, 8.0}; - const float data_y[] = {3.0, 4.0}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 8 * sizeof(float)); - memcpy(ptr_y, data_y, 2 * sizeof(float)); - input_x->unMap(); - input_y->unMap(); - auto output = _Atan2(input_x, input_y); - const std::vector expectedOutput = {-0.32175055, -0.4636476, -0.7853982, -0.7853982, - 1.0303768, 0.98279375, 1.1659045, 1.1071488}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 8, 0.01)) { - MNN_ERROR("Atan2Test test failed!\n"); - return false; - } - return true; - } -}; -class LogicalOrTest : public MNNTestCase { -public: - virtual ~LogicalOrTest() = default; - virtual bool run() { - auto input_x = _Input({4, 2}, NCHW, halide_type_of()); - auto input_y = _Input( - { - 2, - }, - NCHW, halide_type_of()); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const int data_x[] = {true, false, true, false, false, true, true, false}; - const int data_y[] = {true, false}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 8 * sizeof(int)); - memcpy(ptr_y, data_y, 2 * sizeof(int)); - input_x->unMap(); - input_y->unMap(); - auto output = _LogicalOr(input_x, input_y); - const std::vector expectedOutput = {true, false, true, false, true, true, true, false}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 8, 0)) { - MNN_ERROR("LogicalOrTest test failed!\n"); - return false; - } - return true; - } -}; -class NotEqualTest : public MNNTestCase { -public: - virtual ~NotEqualTest() = default; - virtual bool run() { - auto input_x = _Input({4, 2}, NCHW, halide_type_of()); - auto input_y = _Input( - { - 2, - }, - NCHW, halide_type_of()); - input_x->setName("input_x"); - input_y->setName("input_y"); - // set input data - const int data_x[] = {true, false, true, false, false, true, true, false}; - const int data_y[] = {true, false}; - auto ptr_x = input_x->writeMap(); - auto ptr_y = input_y->writeMap(); - memcpy(ptr_x, data_x, 8 * sizeof(int)); - memcpy(ptr_y, data_y, 2 * sizeof(int)); - input_x->unMap(); - input_y->unMap(); - auto output = _NotEqual(input_x, input_y); - const std::vector expectedOutput = {false, false, false, false, true, true, false, false}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 8, 0)) { - MNN_ERROR("NotEqualTest test failed!\n"); - return false; - } - return true; + return test(_Add, "AddTest", 0.01, + {-1.0, -2.0, -3.0, -4.0}, {1.0, 2.0, 3.0, 4.0}, {0.0, 0.0, 0.0, 0.0}, + {4}, {4}, {4}); } }; -class SubtractBroastTest : public MNNTestCase { +class SubtractTest : public BinaryTestCommon { +public: + virtual 
~SubtractTest() = default; + virtual bool run() { + return test(_Subtract, "SubtractTest", 0.01, + {-1.0, -2.0, -3.0, -4.0}, {1.0, 2.0, 3.0, 4.0}, {-2.0, -4.0, -6.0, -8.0}, + {4}, {4}, {4}); + } +}; +class MultiplyTest : public BinaryTestCommon { +public: + virtual ~MultiplyTest() = default; + virtual bool run() { + return test(_Multiply, "MultiplyTest", 0.01, + {-1.0, -2.0, -3.0, -4.0}, {1.0, 2.0, 3.0, 4.0}, {-1.0, -4.0, -9.0, -16.0}, + {4}, {4}, {4}); + } +}; +class DivideTest : public BinaryTestCommon { +public: + virtual ~DivideTest() = default; + virtual bool run() { + return test(_Divide, "DivideTest", 0.01, + {-1.0, -2.0, -3.0, -4.0}, {2.0, 4.0, 6.0, 8.0}, {-0.5, -0.5, -0.5, -0.5}, + {4}, {4}, {4}); + } +}; +class PowTest : public BinaryTestCommon { +public: + virtual ~PowTest() = default; + virtual bool run() { + return test(_Pow, "PowTest", 0.01, + {-1.0, -2.0, -3.0, -4.0}, {2.0, 4.0, 6.0, 4.0}, {1.0, 16.0, 729.0, 256.0}, + {4}, {4}, {4}); + } +}; +class MinimumTest : public BinaryTestCommon { +public: + virtual ~MinimumTest() = default; + virtual bool run() { + return test(_Minimum, "MinimumTest", 0.01, + {-1.0, -2.0, -3.0, -4.0}, {1.0, 2.0, 3.0, 4.0}, {-1.0, -2.0, -3.0, -4.0}, + {4}, {4}, {4}); + } +}; +class MaximumTest : public BinaryTestCommon { +public: + virtual ~MaximumTest() = default; + virtual bool run() { + return test(_Maximum, "MaximumTest", 0.01, + {-1.0, -2.0, -3.0, -4.0}, {2.0, 4.0, 6.0, 8.0}, {2.0, 4.0, 6.0, 8.0}, + {4}, {4}, {4}); + } +}; +class BiasAddTest : public BinaryTestCommon { +public: + virtual ~BiasAddTest() = default; + virtual bool run() { + return test(_BiasAdd, "BiasAddTest", 0.01, + {-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0}, + {1.0, 2.0}, + {0.0, 0.0, -2.0, -2.0, -4.0, -4.0, -6.0, -6.0}, + {4, 2}, {2}, {4, 2}); + } +}; +class GreaterTest : public BinaryTestCommon { +public: + virtual ~GreaterTest() = default; + virtual bool run() { + return test(_Greater, "GreaterTest", 0, + {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, + {3.0, 4.0}, + {0, 0, 0, 0, 1, 1, 1, 1}, + {4, 2}, {2}, {4, 2}); + } +}; +class GreaterEqualTest : public BinaryTestCommon { +public: + virtual ~GreaterEqualTest() = default; + virtual bool run() { + return test(_GreaterEqual, "GreaterEqualTest", 0, + {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, + {3.0, 4.0}, + {0, 0, 1, 1, 1, 1, 1, 1}, + {4, 2}, {2}, {4, 2}); + } +}; +class LessTest : public BinaryTestCommon { +public: + virtual ~LessTest() = default; + virtual bool run() { + return test(_Less, "LessTest", 0, + {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, + {3.0, 4.0}, + {1, 1, 0, 0, 0, 0, 0, 0}, + {4, 2}, {2}, {4, 2}); + } +}; +class FloorDivTest : public BinaryTestCommon { +public: + virtual ~FloorDivTest() = default; + virtual bool run() { + return test(_FloorDiv, "FloorDivTest", 0.01, + {-1.0, -2.0, -3.0, -4.0, 5.0, 6.0, 7.0, 8.1}, + {3.0, 4.0}, + {-1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 2.0, 2.0}, + {4, 2}, {2}, {4, 2}); + } +}; +class SquaredDifferenceTest : public BinaryTestCommon { +public: + virtual ~SquaredDifferenceTest() = default; + virtual bool run() { + return test(_SquaredDifference, "SquaredDifferenceTest", 0.01, + {-1.0, -2.0, -3.0, -4.0, 5.0, 6.0, 7.0, 8.001}, + {3.0, 4.0}, + {16.0, 36.0, 36.0, 64.0, 4.0, 4.0, 16.0, 16.0}, + {4, 2}, {2}, {4, 2}); + } +}; +class EqualTest : public BinaryTestCommon { +public: + virtual ~EqualTest() = default; + virtual bool run() { + return test(_Equal, "EqualTest", 0, + {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, + {3.0, 4.0}, + {0, 0, 1, 1, 0, 0, 0, 0}, + {4, 2}, {2}, {4, 2}); + } +}; 
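Each of the rewritten classes above and below reduces to one data-driven call into BinaryTestCommon::test, which owns the writeMap/readMap plumbing plus the shape and value checks, so a further case only has to supply the operator, a tolerance, and the input/expected vectors with their shapes. A minimal sketch of such a case in the same style, assuming the helper's template parameters are the Tin/Tout pair used in its body; the MaximumBroadcastTest name, its data, and the suite string are illustrative and not part of this patch:

    class MaximumBroadcastTest : public BinaryTestCommon {
    public:
        virtual ~MaximumBroadcastTest() = default;
        virtual bool run() {
            // x: shape {2, 2}; y: shape {2}, broadcast over the first axis.
            // Expected output is the element-wise max of {-1, 5, 2, -3} against {0, 4} per row.
            return test<float, float>(_Maximum, "MaximumBroadcastTest", 0.01,
                                      {-1.0, 5.0, 2.0, -3.0}, {0.0, 4.0},
                                      {0.0, 5.0, 2.0, 4.0},
                                      {2, 2}, {2}, {2, 2});
        }
    };
    // Hypothetical registration; the suite name below is illustrative only.
    // MNNTestSuiteRegister(MaximumBroadcastTest, "op/binary/maximum_broadcast");

Compared with the per-operator classes this patch deletes, the shared helper leaves the expected values as the only per-test content, which is what a reviewer actually needs to verify.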
+class LessEqualTest : public BinaryTestCommon { +public: + virtual ~LessEqualTest() = default; + virtual bool run() { + return test(_LessEqual, "LessEqualTest", 0, + {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, + {3.0, 4.0}, + {1, 1, 1, 1, 0, 0, 0, 0}, + {4, 2}, {2}, {4, 2}); + } +}; +class FloorModTest : public BinaryTestCommon { +public: + virtual ~FloorModTest() = default; + virtual bool run() { + return test(_FloorMod, "FloorModTest", 0.01, + {-1.0f, -2.0f, -3.0f, -4.0f, 5.0f, 6.0f, 7.0f, 8.1f}, + {3.0f, 4.0f}, + {2.0f, 2.0f, 0.0f, 0.0f, 2.0f, 2.0f, 1.0f, 0.1f}, + {4, 2}, {2}, {4, 2}); + } +}; +class Atan2Test : public BinaryTestCommon { +public: + virtual ~Atan2Test() = default; + virtual bool run() { + return test(_Atan2, "Atan2Test", 0.01, + {-1.0, -2.0, -3.0, -4.0, 5.0, 6.0, 7.0, 8.0}, + {3.0, 4.0}, + {-0.32175055, -0.4636476, -0.7853982, -0.7853982, 1.0303768, 0.98279375, 1.1659045, 1.1071488}, + {4, 2}, {2}, {4, 2}); + } +}; +class LogicalOrTest : public BinaryTestCommon { +public: + virtual ~LogicalOrTest() = default; + virtual bool run() { + return test(_LogicalOr, "LogicalOrTest", 0, + {true, false, true, false, false, true, true, false}, + {true, false}, + {true, false, true, false, true, true, true, false}, + {4, 2}, {2}, {4, 2}); + } +}; +class NotEqualTest : public BinaryTestCommon { +public: + virtual ~NotEqualTest() = default; + virtual bool run() { + return test(_NotEqual, "NotEqualTest", 0, + {true, false, true, false, false, true, true, false}, + {true, false}, + {false, false, false, false, true, true, false, false}, + {4, 2}, {2}, {4, 2}); + } +}; + +class BinaryBroadcastShapeTest : public BinaryTestCommon { +public: + virtual ~BinaryBroadcastShapeTest() = default; + virtual bool run() { + vector data_x(8, 1), data_y(8, 1), data_out(64, 2); + vector shape_x = {4, 1, 2, 1}, shape_y = {2, 1, 4}, shape_out = {4, 2, 2, 4}; + return test(_Add, "BinaryBroadcastShapeTest", 0, + data_x, data_y, data_out, shape_x, shape_y, shape_out); + } +}; + +class SubtractBroastTest : public BinaryTestCommon { public: virtual ~SubtractBroastTest() = default; virtual bool run() { - auto input_x = _Input({560}, NCHW); - auto input_y = _Input({1, 20, 560}, NCHW); - input_x->setName("input_x"); - input_y->setName("input_y"); - std::vector x0T(560); - std::vector x1T(560 * 20); - auto x0 = input_x->writeMap(); - auto x1 = input_y->writeMap(); + vector data_x(560), data_y(20 * 560), data_out(20 * 560); + vector shape_x = {560}, shape_y = {1, 20, 560}, shape_out = {1, 20, 560}; for (int i = 0; i < 560; ++i) { - x0[i] = i / 1000.0f; - x0T[i] = x0[i]; + data_x[i] = i / 1000.0f; } for (int i = 0; i < 560 * 20; ++i) { - x1[i] = i / 1000.0f; - x1T[i] = x1[i]; + data_y[i] = i / 1000.0f; } - auto output = _Subtract(input_x, input_y); - auto ptr = output->readMap(); for (int i = 0; i < 20; ++i) { for (int j = 0; j < 560; ++j) { - auto x0V = x0T[j]; - auto x1V = x1T[j + i * 560]; - auto y1V = ptr[j + i * 560]; - auto target = x0V - x1V; - if (fabsf(target - y1V) > 0.01f) { - MNN_ERROR("SubtractTest broascast test failed: i:%d, j:%d, Right: %f - Compute: %f!\n", i, j, y1V, - target); - return false; - } + data_out[j + i * 560] = data_x[j] - data_y[j + i * 560]; } } - return true; + return test(_Subtract, "SubtractBroastTest", 0.01, + data_x, data_y, data_out, shape_x, shape_y, shape_out); } }; diff --git a/test/op/Convolution3DTest.cpp b/test/op/Convolution3DTest.cpp index 1a0e3a31..b7021a0c 100644 --- a/test/op/Convolution3DTest.cpp +++ b/test/op/Convolution3DTest.cpp @@ -148,7 +148,7 @@ protected: 
::memcpy(input->writeMap(), inputData.data(), inputData.size() * sizeof(float)); // difference below 0.5% relative error is considered correct. - if (!checkVectorByRelativeError(output->readMap(), outputData.data(), outputData.size(), 0.005)) { + if (!checkVectorByRelativeError(output->readMap(), outputData.data(), outputData.size(), 0.05)) { MNN_ERROR("%s(%s) test failed!\n", test_op_name.c_str(), device_name.c_str()); return false; } diff --git a/test/op/ConvolutionTest.cpp b/test/op/ConvolutionTest.cpp index 5973a138..e60ef52d 100644 --- a/test/op/ConvolutionTest.cpp +++ b/test/op/ConvolutionTest.cpp @@ -78,7 +78,7 @@ public: protected: static bool test(MNNForwardType type, const std::string& device_name, const std::string& test_op_name, int batch, int ic, int oc, int ih, int iw, PadMode mode, int pad_h, int pad_w, int kh, int kw, int stride, - int dilation, int group) { + int dilation, int group, bool debug = false) { using namespace MNN::Express; std::map padMap = { {PadMode_CAFFE, CAFFE}, {PadMode_VALID, VALID}, {PadMode_SAME, SAME}}; @@ -101,6 +101,23 @@ protected: auto floatData = (float)(data % 255) / 255.0f; inputData.push_back(floatData); } + if (debug) { + MNN_PRINT("inputData:\n["); + for (int i = 0; i < inputData.size(); ++i) { + MNN_PRINT("%f ", inputData[i]); + } + MNN_PRINT("]\n"); + MNN_PRINT("weightData:\n["); + for (int i = 0; i < weightData.size(); ++i) { + MNN_PRINT("%f ", weightData[i]); + } + MNN_PRINT("]\n"); + MNN_PRINT("biasData:\n["); + for (int i = 0; i < biasData.size(); ++i) { + MNN_PRINT("%f ", biasData[i]); + } + MNN_PRINT("]\n"); + } reference_conv2d(inputData, weightData, biasData, outputData, batch, ic, oc, ih, iw, mode, pad_h, pad_w, kh, kw, stride, dilation, group); auto input = _Input({batch, ic, ih, iw}, NCHW, halide_type_of()); @@ -280,9 +297,10 @@ protected: continue; for (int s = 1; s <= 2; s++) { for (int p = 0; p <= 1; p++) { + bool debug = false; bool succ = ConvolutionCommonTest::test( type, device_name, "GroupConv2D", b, ic, oc, is, is, PadMode_CAFFE, - p, p, kh, kw, s, d, g); + p, p, kh, kw, s, d, g, debug); if (!succ) { return false; } diff --git a/test/op/DeconvolutionTest.cpp b/test/op/DeconvolutionTest.cpp index 2b9ec672..f6699077 100644 --- a/test/op/DeconvolutionTest.cpp +++ b/test/op/DeconvolutionTest.cpp @@ -12,15 +12,43 @@ #include #include "MNNTestSuite.h" #include "TestUtils.h" +using namespace std; +using namespace MNN; using namespace MNN::Express; -class DeconvolutionTest : public MNNTestCase { + +class DeconvolutionCommonTest : public MNNTestCase { +public: + virtual ~DeconvolutionCommonTest() = default; + +protected: + static bool test(MNNForwardType type, const std::string& device_name, const std::string& test_op_name, + vector& inputData, vector& weightData, vector& biasData, vector& rightOutData, + int batch, int ic, int oc, int ih, int iw, PadMode mode, int pad_h, int pad_w, int kh, + int kw, int stride, int dilation, int group) { + std::map padMap = { + {PadMode_CAFFE, CAFFE}, {PadMode_VALID, VALID}, {PadMode_SAME, SAME}}; + auto input = _Input({batch, ic, ih, iw}, NCHW, halide_type_of()); + ::memcpy(input->writeMap(), inputData.data(), inputData.size() * sizeof(float)); + auto output = _Deconv(std::move(weightData), std::move(biasData), input, {ic, oc}, {kw, kh}, padMap[mode], + {stride, stride}, {dilation, dilation}, group, {pad_w, pad_h}, false, false); + + // difference below 0.5% relative error is considered correct. 
+ auto outputPtr = output->readMap(); + if (!checkVectorByRelativeError(outputPtr, rightOutData.data(), rightOutData.size(), 0.005)) { + MNN_ERROR("%s(%s) test failed!\n", test_op_name.c_str(), device_name.c_str()); + return false; + } + return true; + } +}; + +class DeconvolutionTest : public DeconvolutionCommonTest { public: virtual ~DeconvolutionTest() = default; virtual bool run() { MNN_PRINT("beigin testcase 0\n"); + { - auto input = _Input({1, 3, 2, 2}, NCHW, halide_type_of()); - std::vector data_a = {// channel 0 1.0, 2.0, 4.0, 5.0, // channel 1 @@ -31,67 +59,19 @@ public: std::vector weight = { // output channel0 // input channel0 - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, // input channel1 - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, // input channel2 - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, // output channel1 // input channel0 - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, // input channel1 - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, // input channel2 - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, }; std::vector bias = {0.0, 0.0}; std::vector data_c = {3.3, 3.3, 9.6, 6.3, 6.3, 3.3, 3.3, 9.6, 6.3, 6.3, 15.6, 15.6, 37.2, @@ -99,27 +79,23 @@ public: 6.6, 6.6, 19.2, 12.6, 12.6, 6.6, 6.6, 19.2, 12.6, 12.6, 31.2, 31.2, 74.4, 43.2, 43.2, 24.6, 24.6, 55.2, 30.6, 30.6, 24.6, 24.6, 55.2, 30.6, 30.6}; + int ic = 3, oc = 2; - int kw = 3, kh = 3; + int kw = 3, kh = 3, ih = 2, iw = 2; int stride = 2, dilation = 1; - int group = 1; + int group = 1, batch = 1; int pad_w = 0, pad_h = 0; - - auto output = _Deconv(std::move(weight), std::move(bias), input, {ic, oc}, {kw, kh}, VALID, - {stride, stride}, {dilation, dilation}, group, {pad_w, pad_h}, false, false); - - ::memcpy(input->writeMap(), data_a.data(), data_a.size() * sizeof(float)); - - if (!checkVectorByRelativeError(output->readMap(), data_c.data(), data_c.size(), 0.005)) { - MNN_ERROR("DeconvolutionTest0 test failed!\n"); + + bool succ = DeconvolutionCommonTest::test(MNN_FORWARD_CPU, "CPU", "DeconvolutionTest0", data_a, weight, bias, data_c, + batch, ic, oc, ih, iw, PadMode_VALID, pad_h, pad_w, kh, kw, + stride, dilation, group); + if (!succ) { return false; } } MNN_PRINT("beigin testcase 1\n"); { - auto input = _Input({1, 3, 2, 2}, NCHW, halide_type_of()); - std::vector data_a = {// channel 0 1.0, 2.0, 4.0, 5.0, // channel 1 @@ -130,109 +106,19 @@ public: std::vector weight = { // output channel0 // input channel0 - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, // input channel1 - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, // input channel2 - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, // output channel1 // input channel0 - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, 
- 2.0, - 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, // input channel1 - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, // input channel2 - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, }; std::vector bias = {1.0, 2.0}; std::vector data_c = { @@ -241,26 +127,21 @@ public: 8.6, 21.2, 21.2, 14.6, 33.2, 76.4, 76.4, 45.2, 33.2, 76.4, 76.4, 45.2, 26.6, 57.2, 57.2, 32.6, }; int ic = 3, oc = 2; - int kw = 4, kh = 4; + int kw = 4, kh = 4, ih = 2, iw = 2; int stride = 2, dilation = 1; - int group = 1; + int group = 1, batch = 1; int pad_w = 1, pad_h = 1; - - auto output = _Deconv(std::move(weight), std::move(bias), input, {ic, oc}, {kw, kh}, VALID, - {stride, stride}, {dilation, dilation}, group, {pad_w, pad_h}, false, false); - - ::memcpy(input->writeMap(), data_a.data(), data_a.size() * sizeof(float)); - - if (!checkVectorByRelativeError(output->readMap(), data_c.data(), data_c.size(), 0.005)) { - MNN_ERROR("DeconvolutionTest1 test failed!\n"); + + bool succ = DeconvolutionCommonTest::test(MNN_FORWARD_CPU, "CPU", "Deconv", data_a, weight, bias, data_c, + batch, ic, oc, ih, iw, PadMode_VALID, pad_h, pad_w, kh, kw, + stride, dilation, group); + if (!succ) { return false; } } MNN_PRINT("beigin testcase 2\n"); { - auto input = _Input({1, 3, 2, 2}, NCHW, halide_type_of()); - std::vector data_a = {// channel 0 1.0, 2.0, 4.0, 5.0, // channel 1 @@ -271,67 +152,19 @@ public: std::vector weight = { // output channel0 // input channel0 - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, // input channel1 - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, // input channel2 - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, // output channel1 // input channel0 - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, // input channel1 - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, // input channel2 - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, }; std::vector bias = {0.0, 0.0}; std::vector data_c = {3.3, 3.3, 9.6, 6.3, 3.3, 3.3, 9.6, 6.3, 15.6, 15.6, 37.2, @@ -340,18 +173,15 @@ public: 6.6, 6.6, 19.2, 12.6, 6.6, 6.6, 19.2, 12.6, 31.2, 31.2, 74.4, 43.2, 24.6, 24.6, 55.2, 30.6}; int ic = 3, oc = 2; - int kw = 3, kh = 3; + int kw = 3, kh = 3, ih = 2, iw = 2; int stride = 2, dilation = 1; - int group = 1; + int group = 1, batch = 1; int pad_w = 0, pad_h = 0; - auto output = _Deconv(std::move(weight), std::move(bias), input, {ic, oc}, {kw, kh}, SAME, - {stride, stride}, {dilation, dilation}, group, {pad_w, pad_h}, false, false); - - ::memcpy(input->writeMap(), data_a.data(), data_a.size() * sizeof(float)); - - if (!checkVectorByRelativeError(output->readMap(), data_c.data(), data_c.size(), 0.005)) { - MNN_ERROR("DeconvolutionTest2 test failed!\n"); + bool succ = DeconvolutionCommonTest::test(MNN_FORWARD_CPU, "CPU", "Deconv", data_a, weight, bias, data_c, + batch, ic, oc, ih, iw, PadMode_SAME, pad_h, pad_w, kh, kw, + stride, dilation, 
group); + if (!succ) { return false; } } @@ -360,3 +190,4 @@ public: } }; MNNTestSuiteRegister(DeconvolutionTest, "op/Deconvolution"); + diff --git a/test/op/GridSampleTest.cpp b/test/op/GridSampleTest.cpp new file mode 100644 index 00000000..cdbf3edf --- /dev/null +++ b/test/op/GridSampleTest.cpp @@ -0,0 +1,253 @@ +// +// CropAndResizeTest.cpp +// MNNTests +// +// Created by MNN on 2021/03/11. +// Copyright © 2018, Alibaba Group Holding Limited +// +#include +#include +#include + +#include +#include +#include +#include "MNNTestSuite.h" +#include "TestUtils.h" + +using namespace MNN::Express; + +static float getPosition(float x, int range, bool alignCorners, GridSamplePaddingMode paddingMode) { + if (paddingMode == GRID_SAMPLE_PADDING_REFLECTION) { + // if x is on the left side of -1.0, move it to the right side of 1.0 + if (x < -1.0f) { + x = x + ::ceil(1 - x) * 4; + } + // reflect + if (x > 1.0f) { + float l = x - 1.0f; + int reflectionNum = ::floor(l / 2.0); + float offset = l - reflectionNum * 2.0f; + x = (reflectionNum % 2 == 0) ? (1 - offset) : (-1.0f + offset); + } + } + + float a = alignCorners ? 1.0f : 0.0f; + float b = alignCorners ? 0.0f : 1.0f; + return ((1 + x) * (range - a) - b) / 2.0f; +} + +static int CLAMP(int v, int min, int max) { + if ((v) < min) { + (v) = min; + } else if ((v) > max) { + (v) = max; + } + return v; +} + +static float sample(int h, int w, const float *buffer, int height, int width, GridSamplePaddingMode paddingMode) { + if (h < 0 || h >= height || w < 0 || w >= width) { + if (paddingMode == GRID_SAMPLE_PADDING_ZEROS) { + return 0.0f; + } + // Clearly, CLAMP is the right way to go for GridSamplePaddingMode_BORDER + // For GridSamplePaddingMode_REFLECTION, since we have reflected the values into (-1, 1), + // the leftover reflections degrade to GridSamplePaddingMode_BORDER + h = CLAMP(h, 0, height-1); + w = CLAMP(w, 0, width-1); + } + + return buffer[h * width + w]; +} + +static float interpolate(float h, float w, const float *buffer, int height, int width, InterpolationMethod mode, + GridSamplePaddingMode paddingMode) { + if (mode == NEAREST) { + int nh = ::floor(h+0.5f); + int nw = ::floor(w+0.5f); + return sample(nh, nw, buffer, height, width, paddingMode); + } + + // mode == GridSampleMode_BILINEAR + int w0_h = ::floor(h); + int w0_w = ::floor(w); + int w1_h = w0_h + 1; + int w1_w = w0_w + 1; + + float i00 = sample(w0_h, w0_w, buffer, height, width, paddingMode); + float i01 = sample(w0_h, w1_w, buffer, height, width, paddingMode); + float i10 = sample(w1_h, w0_w, buffer, height, width, paddingMode); + float i11 = sample(w1_h, w1_w, buffer, height, width, paddingMode); + + float i0 = i00 * (w1_w - w) + i01 * (w - w0_w); + float i1 = i10 * (w1_w - w) + i11 * (w - w0_w); + + return i0 * (w1_h - h) + i1 * (h - w0_h); +} + +static void reference_grid_sample(const float *inputPtr, const float *gridPtr, std::vector &output, + int batch, int inHeight, int inWidth, int outHeight, int outWidth, int depth, + InterpolationMethod mode, GridSamplePaddingMode paddingMode, bool alignCorners) { + output.resize(batch * outHeight * outWidth * depth); + + float *outputPtr = output.data(); + for (auto b = 0; b < batch; ++b) { + const float *_inputPtr = inputPtr + b * inHeight * inWidth * depth; + const float *_gridPtr = gridPtr + b * outHeight * outWidth * 2; + float *_outputPtr = outputPtr + b * outHeight * outWidth * depth; + + for (auto c = 0; c < depth; ++c) { + auto __inputPtr = _inputPtr + c * inHeight * inWidth; + auto __outputPtr = _outputPtr + c * outHeight * 
outWidth; + + for (auto h = 0; h < outHeight; ++h) { + auto __gridPtr = _gridPtr + h * outWidth * 2; + auto ___outputPtr = __outputPtr + h * outWidth; + + for (auto w = 0; w < outWidth; ++w) { + auto x = getPosition(__gridPtr[2 * w + 0], inWidth, alignCorners, paddingMode); + auto y = getPosition(__gridPtr[2 * w + 1], inHeight, alignCorners, paddingMode); + + ___outputPtr[w] = interpolate(y, x, __inputPtr, inHeight, inWidth, mode, paddingMode); + } + } + } + } + +} + +/** + @brief check the result with the ground truth + @param result data + @param rightData + @param size + @param threshold + */ +template +bool checkVector(const T* result, const T* rightData, int size, T threshold, T ratio){ + MNN_ASSERT(result != nullptr); + MNN_ASSERT(rightData != nullptr); + MNN_ASSERT(size >= 0); + int count = 0; + for(int i = 0; i < size; ++i){ + if(fabs(result[i] - rightData[i]) > threshold){ + //std::cout << "right: " << rightData[i] << ", compute: " << result[i] << std::endl; + count ++; + } + } + + float miss_match_ratio = 1.0f*count/size; + if (miss_match_ratio > ratio) { + std::cout << "ratio threshold: " << ratio << ", miss match ratio: " << miss_match_ratio << std::endl; + return false; + } + + return true; +} + + +class GridSampleTest : public MNNTestCase { +public: + virtual ~GridSampleTest() = default; + + virtual bool run() { + const std::vector> configs({ + {1, 3, 5, 10, 5, 10}, + {1, 62, 6, 10, 12, 20}, + {2, 64, 12, 20, 6, 6}, + {1, 3, 384, 640, 384, 640}, + }); + + for (auto config : configs) { + const int batch = config[0]; + const int depth = config[1]; + const int inHeight = config[2]; + const int inWidth = config[3]; + const int outHeight = config[4]; + const int outWidth = config[5]; + + auto input = _Input({batch, depth, inHeight, inWidth}, NCHW); + auto grid = _Input({batch, outHeight, outWidth, 2}, NHWC); + + auto inputPtr = input->writeMap(); + auto gridPtr = grid->writeMap(); + + std::random_device rd{}; + std::mt19937 gen{rd()}; + std::normal_distribution<> inputDist{0.0f, 1.0}; + std::normal_distribution<> gridDist{0.0f, 3.0f / outWidth}; + + for (int i = 0; i < batch * inHeight * inWidth * depth; i++) { + inputPtr[i] = inputDist(gen); + } + for (int b = 0; b < batch; b++) { + for (int h = 0; h < outHeight; h++) { + for (int w = 0; w < outWidth; w++) { + float offsetH = gridDist(gen); + float offsetW = gridDist(gen); + gridPtr[b * outHeight * outWidth * 2 + h * outWidth * 2 + w * 2 + 0] = + 2.0f * w / (outWidth-1) - 1.0f + offsetW; + gridPtr[b * outHeight * outWidth * 2 + h * outWidth * 2 + w * 2 + 1] = + 2.0f * h / (outHeight-1) - 1.0f + offsetH; + } + } + } + + std::vector modes({BILINEAR}); + std::vector paddingModes({GRID_SAMPLE_PADDING_ZEROS}); + std::vector alignCornersVec({false}); + +#define MNN_METAL_FULL_PRECISION 0 +#if MNN_METAL_FULL_PRECISION + bool usingMetalLowPrecision = false; +#else + auto runtime = MNN::Express::Executor::getGlobalExecutor()->getRuntime(); + bool usingMetalLowPrecision = runtime.first.find(MNN_FORWARD_METAL) != runtime.first.end(); +#endif + + std::vector expectedOutput(batch * outHeight * outWidth * depth); + for (auto mode : modes) { + for (auto paddingMode : paddingModes) { + for (auto alignCorners : alignCornersVec) { + reference_grid_sample(inputPtr, gridPtr, expectedOutput, + batch, inHeight, inWidth, outHeight, outWidth, depth, + mode, paddingMode, alignCorners); + auto expectedOutPtr = expectedOutput.data(); + + grid->unMap(); + input->unMap(); + input = _Convert(input, NC4HW4); + + auto output = _GridSample(input, grid, mode, 
paddingMode, alignCorners); + output = _Convert(output, NCHW); + auto outputPtr = output->readMap(); + + if (usingMetalLowPrecision) { + if (!checkVector(outputPtr, expectedOutPtr, expectedOutput.size(), 0.2, 0.01)) { + MNN_ERROR("GridSampleTest test failed!\n"); + return false; + } + } else { + if (mode == NEAREST) { + if (!checkVector(outputPtr, expectedOutPtr, expectedOutput.size(), 0.01, 0.001)) { + MNN_ERROR("GridSampleTest NEAREST test failed!\n"); + return false; + } + } else { + if (!checkVector(outputPtr, expectedOutPtr, expectedOutput.size(), 0.01)) { + MNN_ERROR("GridSampleTest BILINEAR test failed!\n"); + return false; + } + } + } + } + } + } + } + + return true; + } +}; + +MNNTestSuiteRegister(GridSampleTest, "op/GridSample"); diff --git a/test/op/MatMulTest.cpp b/test/op/MatMulTest.cpp index 9857e83e..4c7d57b6 100644 --- a/test/op/MatMulTest.cpp +++ b/test/op/MatMulTest.cpp @@ -78,7 +78,7 @@ protected: ::memcpy(input_a->writeMap(), data_a.data(), data_a.size() * sizeof(float)); ::memcpy(input_b->writeMap(), data_b.data(), data_b.size() * sizeof(float)); auto outputPtr = output->readMap(); - if (!checkVectorByRelativeError(outputPtr, data_c.data(), data_c.size(), 0.005)) { + if (!checkVectorByRelativeError(outputPtr, data_c.data(), data_c.size(), 0.05)) { MNN_ERROR("%s: %d x %d - %d x %d -> %d, %d , transpose: %d, %d, test failed!\n", test_op_name.c_str(), width_a, height_a, width_b, height_b, output->getInfo()->dim[1], output->getInfo()->dim[0], tranpose_a, tranpose_b); diff --git a/test/op/MultiDeconvolutionTest.cpp b/test/op/MultiDeconvolutionTest.cpp index 294192e3..9e726761 100644 --- a/test/op/MultiDeconvolutionTest.cpp +++ b/test/op/MultiDeconvolutionTest.cpp @@ -83,7 +83,7 @@ protected: if (!checkVectorByRelativeError(outputPtr, outputData.data(), outputData.size(), 0.005)) { MNN_ERROR("MultiDeconvolution(%s) test failed!\n", deviceName.c_str()); for (int v = 0; v < outputData.size(); ++v) { - MNN_ERROR("Corret:%f, Error:%f\n", outputData[v], outputPtr[v]); + MNN_ERROR("Correct:%f, Error:%f\n", outputData[v], outputPtr[v]); } return false; } diff --git a/test/op/ReluGradTest.cpp b/test/op/ReluGradTest.cpp deleted file mode 100644 index 26685273..00000000 --- a/test/op/ReluGradTest.cpp +++ /dev/null @@ -1,107 +0,0 @@ -// -// ReluGradTest.cpp -// MNNTests -// -// Created by MNN on 2019/10/16. 
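The grid-to-pixel mapping used by the GridSample reference above is worth spelling out: getPosition() converts a normalized coordinate in [-1, 1] to a pixel position, and alignCorners decides whether -1/1 land on the first/last pixel centres or half a pixel outside them. A minimal Python sketch of the same formula (illustrative only, mirroring the C++ helper and omitting the reflection branch):

    def get_position(x, size, align_corners):
        # ((1 + x) * (size - a) - b) / 2, as in the reference getPosition()
        a = 1.0 if align_corners else 0.0
        b = 0.0 if align_corners else 1.0
        return ((1.0 + x) * (size - a) - b) / 2.0

    get_position(-1.0, 10, True)    # 0.0  -> centre of the first pixel
    get_position( 1.0, 10, True)    # 9.0  -> centre of the last pixel
    get_position(-1.0, 10, False)   # -0.5 -> half a pixel outside the image
    get_position( 1.0, 10, False)   # 9.5

So align_corners=True maps [-1, 1] onto [0, size-1], while align_corners=False maps it onto [-0.5, size-0.5]; positions that fall outside the image are then handled by the padding mode.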
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#include -#include -#include -#include -#include -#include -#include "MNNTestSuite.h" -#include "MNN_generated.h" -#include "TestUtils.h" - -using namespace MNN::Express; - -static VARP _ReluGrad(VARP originInput, VARP inputGrad) { - using namespace MNN; - std::unique_ptr relu(new OpT); - relu->type = OpType_ReluGrad; - relu->main.type = OpParameter_Relu; - relu->main.value = new ReluT; - relu->main.AsRelu()->slope = 0.0f; - return Variable::create(Expr::create(std::move(relu), {originInput, inputGrad})); -} - -static VARP _Relu6Grad(VARP originInput, VARP inputGrad) { - using namespace MNN; - std::unique_ptr relu6(new OpT); - relu6->type = OpType_Relu6Grad; - relu6->main.type = OpParameter_Relu6; - relu6->main.value = new Relu6T; - return Variable::create(Expr::create(std::move(relu6), {originInput, inputGrad})); -} - -class ReluGradTest : public MNNTestCase { -public: - virtual ~ReluGradTest() = default; - -protected: - bool testOnBackend(MNNForwardType type, const std::string& deviceName) { - const int h = 4, w = 4, size = h * w; - const std::vector originInputData = {6.2025, -0.0156, 0.0765, 6.1872, 0.0455, 6.3100, 0.0162, -0.1304, - -0.0330, 0.0641, 6.2964, 0.0452, 0.2203, -0.0665, 0.1727, 0.1119}; - const std::vector inputGradData = {1., 2., 3., 4., 2., 3., 4., 1., 3., 4., 1., 2., 4., 1., 2., 3.}; - std::vector reluExpectedGrad(size), relu6ExpectedGrad(size); - for (int i = 0; i < size; ++i) { - bool positive = (originInputData[i] > 0); - bool under6 = (originInputData[i] < 6); - reluExpectedGrad[i] = (positive ? inputGradData[i] : 0); - relu6ExpectedGrad[i] = ((positive && under6) ? inputGradData[i] : 0); - } - - auto input = _Input({1, 1, h, w}, NCHW, halide_type_of()); - auto inputGrad = _Input({1, 1, h, w}, NCHW, halide_type_of()); - auto inputConvert = _Convert(input, NC4HW4); - auto inputGradConvert = _Convert(inputGrad, NC4HW4); - auto reluGrad = _Convert(_ReluGrad(inputConvert, inputGradConvert), NCHW); - auto relu6Grad = _Convert(_Relu6Grad(inputConvert, inputGradConvert), NCHW); - - const std::vector outDim = {1, 1, h, w}; - auto reluGradDim = reluGrad->getInfo()->dim; - auto relu6GradDim = relu6Grad->getInfo()->dim; - if (!checkVector(reluGradDim.data(), outDim.data(), 4, 0)) { - MNN_ERROR("ReluGrad(%s) shape test failed!\n", deviceName.c_str()); - return false; - } - if (!checkVector(relu6GradDim.data(), outDim.data(), 4, 0)) { - MNN_ERROR("Relu6Grad(%s) shape test failed!\n", deviceName.c_str()); - return false; - } - - ::memcpy(input->writeMap(), originInputData.data(), size * sizeof(float)); - ::memcpy(inputGrad->writeMap(), inputGradData.data(), size * sizeof(float)); - if (!checkVector(reluGrad->readMap(), reluExpectedGrad.data(), size, 1e-6)) { - MNN_ERROR("ReluGrad(%s) test failed!\n", deviceName.c_str()); - return false; - } - if (!checkVector(relu6Grad->readMap(), relu6ExpectedGrad.data(), size, 1e-6)) { - MNN_ERROR("Relu6Grad(%s) test failed!\n", deviceName.c_str()); - return false; - } - return true; - } -}; - -class ReluGradTestOnCPU : public ReluGradTest { -public: - virtual ~ReluGradTestOnCPU() = default; - virtual bool run() { - return testOnBackend(MNN_FORWARD_CPU, "CPU"); - } -}; - -class ReluGradTestOnOpencl : public ReluGradTest { -public: - virtual ~ReluGradTestOnOpencl() = default; - virtual bool run() { - return testOnBackend(MNN_FORWARD_OPENCL, "OPENCL"); - } -}; - -MNNTestSuiteRegister(ReluGradTestOnCPU, "op/ReluGrad"); diff --git a/test/op/ReverseTest.cpp b/test/op/ReverseTest.cpp new file 
mode 100644 index 00000000..7c4104bd --- /dev/null +++ b/test/op/ReverseTest.cpp @@ -0,0 +1,72 @@ +// +// ReverseTest.cpp +// MNNTests +// +// Created by MNN on 2021/02/20. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include +#include +#include "MNNTestSuite.h" +#include "TestUtils.h" + +using namespace MNN::Express; +class ReverseTest : public MNNTestCase { +public: + virtual ~ReverseTest() = default; + virtual bool run() { + auto input = _Input({3, 2, 3}, NCHW); + input->setName("input_tensor"); + // set input data + const float inpudata[] = { 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18 }; + auto inputPtr = input->writeMap(); + memcpy(inputPtr, inpudata, 18 * sizeof(float)); + auto output0 = _Reverse(input, _Scalar(0)); + const std::vector expectedOutput0 = { 13, 14, 15, 16, 17, 18, + 7, 8, 9, 10, 11, 12, + 1, 2, 3, 4, 5, 6 }; + auto gotOutput0 = output0->readMap(); + for (int i = 0; i < 18; ++i) { + auto diff = ::fabsf(gotOutput0[i] - expectedOutput0[i]); + if (diff > 0.01) { + MNN_ERROR("ReverseTest[axis=0] test failed: %f - %f!\n", expectedOutput0[i], gotOutput0[i]); + return false; + } + } + auto output1 = _Reverse(input, _Scalar(1)); + const std::vector expectedOutput1 = { 4, 5, 6, 1, 2, 3, + 10, 11, 12, 7, 8, 9, + 16, 17, 18, 13, 14, 15 }; + auto gotOutput1 = output1->readMap(); + for (int i = 0; i < 18; ++i) { + auto diff = ::fabsf(gotOutput1[i] - expectedOutput1[i]); + if (diff > 0.01) { + MNN_ERROR("ReverseTest[axis=1] test failed: %f - %f!\n", expectedOutput1[i], gotOutput1[i]); + return false; + } + } + auto output2 = _Reverse(input, _Scalar(2)); + const std::vector expectedOutput2 = { 3, 2, 1, 6, 5, 4, + 9, 8, 7, 12, 11, 10, + 15, 14, 13, 18, 17, 16 }; + auto gotOutput2 = output2->readMap(); + for (int i = 0; i < 18; ++i) { + auto diff = ::fabsf(gotOutput2[i] - expectedOutput2[i]); + if (diff > 0.01) { + MNN_ERROR("ReverseTest[axis=2] test failed: %f - %f!\n", expectedOutput2[i], gotOutput2[i]); + return false; + } + } + return true; + } +private: + VARP _Reverse(VARP x, VARP axis) { + std::unique_ptr op(new MNN::OpT); + op->type = MNN::OpType_Reverse; + return (Variable::create(Expr::create(op.get(), {x, axis}))); + } +}; +MNNTestSuiteRegister(ReverseTest, "op/reverse"); diff --git a/test/op/SelectTest.cpp b/test/op/SelectTest.cpp index 362dd0d9..45589729 100644 --- a/test/op/SelectTest.cpp +++ b/test/op/SelectTest.cpp @@ -57,6 +57,11 @@ bool RunSelectAndCheckResult(VARP select, VARP input0, VARP input1) { auto output = _Select(select, input0, input1); MNN_ASSERT(Size(input0) == Size(output)); + int iter0 = input0->getInfo()->size == 1 ? 0 : 1; + int iter1 = input1->getInfo()->size == 1 ? 0 : 1; + auto outputPtr = output->readMap(); + auto input0Ptr = input0->readMap(); + auto input1Ptr = input1->readMap(); for (int i = 0; i < Size(output); ++i) { int condition = select->readMap()[0]; // TODO(houjiang): Correct Select. 
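The expected vectors in ReverseTest are simply the 3x2x3 input flipped along one axis; they can be cross-checked with NumPy, which is a convenient way to regenerate ground truth for similar cases (a quick sketch, independent of the test code):

    import numpy as np

    x = np.arange(1, 19, dtype=np.float32).reshape(3, 2, 3)   # same data as input_tensor
    print(np.flip(x, axis=0).flatten())  # 13 14 15 16 17 18  7  8  9 10 11 12  1 ...  6
    print(np.flip(x, axis=1).flatten())  #  4  5  6  1  2  3 10 11 12  7  8  9 16 ... 15
    print(np.flip(x, axis=2).flatten())  #  3  2  1  6  5  4  9  8  7 12 11 10 15 ... 16

These match expectedOutput0, expectedOutput1 and expectedOutput2 in the test.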
@@ -64,9 +69,13 @@ bool RunSelectAndCheckResult(VARP select, VARP input0, VARP input1) { condition = select->readMap()[i]; } if (condition) { - CHECK_EQ_OR_RETURN(output, input0, i); + if (input0Ptr[i * iter0] != outputPtr[i]) { + return false; + } } else { - CHECK_EQ_OR_RETURN(output, input1, i); + if (input1Ptr[i * iter1] != outputPtr[i]) { + return false; + } } } return true; @@ -97,6 +106,11 @@ bool SelectTester4D(int N, int C, int H, int W) { auto select = _Input({1}, NCHW); CHECK_OR_RETURN(RunSelectAndCheckResult(select, input0, input1)); } + { + auto select = _Input({N, C, H, W}, NCHW); + auto input0 = _Input({1}, NCHW); + CHECK_OR_RETURN(RunSelectAndCheckResult(select, input0, input1)); + } return true; } diff --git a/test/op/SoftmaxGradTest.cpp b/test/op/SoftmaxGradTest.cpp deleted file mode 100644 index 8aa86be9..00000000 --- a/test/op/SoftmaxGradTest.cpp +++ /dev/null @@ -1,95 +0,0 @@ -// -// SoftmaxGradTest.cpp -// MNNTests -// -// Created by MNN on 2019/10/16. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include -#include -#include -#include -#include -#include -#include "MNNTestSuite.h" -#include "MNN_generated.h" -#include "TestUtils.h" - -using namespace MNN::Express; - -static VARP _SoftmaxGrad(VARP originOutput, VARP outputGrad, int axis) { - using namespace MNN; - std::unique_ptr softmax(new OpT); - softmax->type = OpType_SoftmaxGrad; - softmax->main.type = OpParameter_Axis; - softmax->main.value = new AxisT; - softmax->main.AsAxis()->axis = axis; - return Variable::create(Expr::create(std::move(softmax), {originOutput, outputGrad})); -} - -class SoftmaxGradTest : public MNNTestCase { -public: - virtual ~SoftmaxGradTest() = default; - -protected: - bool testOnBackend(MNNForwardType type, const std::string &deviceName) { - const int batch = 4, channel = 4, size = batch * channel; - float originOutputData[batch][channel] = { - {0.2, 0.23, 0.3, 0.27}, {0.18, 0.33, 0.16, 0.33}, {0.15, 0.18, 0.35, 0.32}, {0.29, 0.18, 0.22, 0.31}}; - float outputGradData[batch][channel] = {{1., 2., 3., 4.}, {2., 3., 4., 1.}, {3., 4., 1., 2.}, {4., 1., 2., 3.}}; - float expectGrad[batch][channel]; - for (int b = 0; b < batch; ++b) { - float sum = 0; - for (int c = 0; c < channel; ++c) { - sum += originOutputData[b][c] * outputGradData[b][c]; - } - for (int c = 0; c < channel; ++c) { - expectGrad[b][c] = originOutputData[b][c] * (outputGradData[b][c] - sum); - } - } - - auto output = _Input({batch, channel}, NCHW, halide_type_of()); - auto outputGrad = _Input({batch, channel}, NCHW, halide_type_of()); - auto outputConvert = _Convert(output, NC4HW4); - auto outputGradConvert = _Convert(outputGrad, NC4HW4); - auto softmaxGrad = _Convert(_SoftmaxGrad(outputConvert, outputGradConvert, 1), NCHW); - - if (type != MNN_FORWARD_CPU) { - Optimizer::Config config; - config.forwardType = type; - auto optimizer = Optimizer::create(config); - if (optimizer == nullptr) { - MNN_ERROR("backend %s not support\n", deviceName.c_str()); - return false; - } - optimizer->onExecute({softmaxGrad}); - } - - const std::vector outDim = {batch, channel}; - auto softmaxGradDim = softmaxGrad->getInfo()->dim; - if (!checkVector(softmaxGradDim.data(), outDim.data(), 2, 0)) { - MNN_ERROR("SoftmaxGrad(%s) shape test failed!\n", deviceName.c_str()); - return false; - } - - ::memcpy(output->writeMap(), (const float *)originOutputData, size * sizeof(float)); - ::memcpy(outputGrad->writeMap(), (const float *)outputGradData, size * sizeof(float)); - auto compute = softmaxGrad->readMap(); - if 
(!checkVectorByRelativeError(compute, (const float *)expectGrad, size, 0.005)) { - MNN_ERROR("SoftmaxGrad(%s) test failed!\n", deviceName.c_str()); - return false; - } - return true; - } -}; - -class SoftmaxGradTestOnCPU : public SoftmaxGradTest { -public: - virtual ~SoftmaxGradTestOnCPU() = default; - virtual bool run() { - return testOnBackend(MNN_FORWARD_CPU, "CPU"); - } -}; - -MNNTestSuiteRegister(SoftmaxGradTestOnCPU, "op/SoftmaxGrad"); diff --git a/test/op/StridedSliceTest.cpp b/test/op/StridedSliceTest.cpp index 4f9bf6f4..59bc5b27 100644 --- a/test/op/StridedSliceTest.cpp +++ b/test/op/StridedSliceTest.cpp @@ -73,6 +73,21 @@ public: MNN_ERROR("stridedslice (ellipsisMask=2, shrinkAxisMask=4) test failed!\n"); return false; } + // 6. beginMask = 9, endMask = 15 + const int begin_data6[] = {0, 1, 1, 0}; + memcpy(begin->writeMap(), begin_data6, 4 * sizeof(int)); + const int end_data6[] = {0, 0, 0, 0}; + memcpy(end->writeMap(), end_data6, 4 * sizeof(int)); + const int stride_data6[] = {1, 1, 1, 1}; + memcpy(strided->writeMap(), stride_data6, 4 * sizeof(int)); + auto output_6 = _StridedSlice(input, begin, end, strided, 9, 15, 0, 0, 0); + const std::vector expectedShape_6 = {1, 2, 1, 3}; + const std::vector expectedOutput_6 = {4, 4, 4, 6, 6, 6}; + if (!checkVector(output_6->getInfo()->dim.data(), expectedShape_6.data(), expectedShape_6.size(), 0) || + !checkVector(output_6->readMap(), expectedOutput_6.data(), expectedOutput_6.size(), 0.01)) { + MNN_ERROR("stridedslice (beginMask=9, endMask=15) test failed!\n"); + return false; + } return true; } }; diff --git a/test/op/UnaryTest.cpp b/test/op/UnaryTest.cpp index a42d0c9d..1d804c95 100644 --- a/test/op/UnaryTest.cpp +++ b/test/op/UnaryTest.cpp @@ -6,760 +6,331 @@ // Copyright © 2018, Alibaba Group Holding Limited // +#include #include #include #include "MNNTestSuite.h" #include "TestUtils.h" using namespace MNN::Express; -class AbsTest : public MNNTestCase { +using namespace std; + +class UnaryTestCommon : public MNNTestCase { +protected: + template + bool test(VARP (*opFunc)(VARP), string name, Tout threshold, const vector& data_in, + const vector& data_out, const vector& shape_in, const vector& shape_out) { + int size_in = 1, size_out = 1; + for (int i = 0; i < shape_in.size(); ++i) { + size_in *= shape_in[i]; + } + for (int i = 0; i < shape_out.size(); ++i) { + size_out *= shape_out[i]; + } + + auto input = _Input(shape_in, NCHW, halide_type_of()); + input->setName("input_tensor"); + // set input data + auto ptr_in = input->template writeMap(); + memcpy(ptr_in, data_in.data(), size_in * sizeof(Tin)); + input->unMap(); + auto output = opFunc(input); + auto gotOutput = output->template readMap(); + + auto shape_got = output->getInfo()->dim; + if (shape_got.size() != shape_out.size()) { + MNN_ERROR("%s shape compute error!\n", name.c_str()); + return false; + } + for (int i = 0; i < shape_got.size(); i++) { + if (shape_got[i] != shape_out[i]) { + MNN_ERROR("%s shape compute error!\n", name.c_str()); + return false; + } + } + + if (!checkVector(gotOutput, data_out.data(), size_out, threshold)) { + MNN_ERROR("%s test failed!\n", name.c_str()); + return false; + } + return true; + } +}; + +class AbsTest : public UnaryTestCommon { public: virtual ~AbsTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.0, -2.0, 3.0, 4.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - 
auto output = _Abs(input); - const std::vector expectedOutput = {1.0, 2.0, 3.0, 4.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("AbsTest test failed!\n"); - return false; - } - return true; + return test(_Abs, "AbsTest", 0.01, + {-1.0, -2.0, 3.0, 4.0, -1.0, -2.0, 3.0, 4.0}, {1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0}, + {8}, {8}); } }; -class NegativeTest : public MNNTestCase { +class NegativeTest : public UnaryTestCommon { public: virtual ~NegativeTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.0, -2.0, 3.0, 4.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Negative(input); - const std::vector expectedOutput = {1.0, 2.0, -3.0, -4.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("NegativeTest test failed!\n"); - return false; - } - return true; + return test(_Negative, "NegativeTest", 0.01, + {-1.0, -2.0, 3.0, 4.0, -1.0, -2.0, 3.0, 4.0}, {1.0, 2.0, -3.0, -4.0, 1.0, 2.0, -3.0, -4.0}, + {8}, {8}); } }; -class FloorTest : public MNNTestCase { +class FloorTest : public UnaryTestCommon { public: virtual ~FloorTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.3, -2.6, 3.2, 4.6}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Floor(input); - const std::vector expectedOutput = {-2.0, -3.0, 3.0, 4.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("FloorTest test failed!\n"); - return false; - } - return true; + return test(_Floor, "FloorTest", 0.01, + {-1.3, -2.6, 3.2, 4.6}, {-2.0, -3.0, 3.0, 4.0}, + {4}, {4}); } }; -class CeilTest : public MNNTestCase { +class CeilTest : public UnaryTestCommon { public: virtual ~CeilTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.3, -2.6, 3.2, 4.6}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Ceil(input); - const std::vector expectedOutput = {-1.0, -2.0, 4.0, 5.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("CeilTest test failed!\n"); - return false; - } - return true; + return test(_Ceil, "CeilTest", 0.01, + {-1.3, -2.6, 3.2, 4.6}, {-1.0, -2.0, 4.0, 5.0}, + {4}, {4}); } }; -class SquareTest : public MNNTestCase { +class SquareTest : public UnaryTestCommon { public: virtual ~SquareTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.0, -2.0, 3.0, 4.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Square(input); - const std::vector expectedOutput = {1.0, 4.0, 9.0, 16.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("SquareTest test failed!\n"); - return false; - } - return true; + return test(_Square, "SquareTest", 0.01, + {-1.0, -2.0, 3.0, 4.0, -1.0, 
-2.0, 3.0, 4.0}, {1.0, 4.0, 9.0, 16.0, 1.0, 4.0, 9.0, 16.0}, + {8}, {8}); } }; -class SqrtTest : public MNNTestCase { +class SqrtTest : public UnaryTestCommon { public: virtual ~SqrtTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {1.0, 4.0, 9.0, 16.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Sqrt(input); - const std::vector expectedOutput = {1.0, 2.0, 3.0, 4.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("SqrtTest test failed!\n"); - return false; - } - return true; + return test(_Sqrt, "SqrtTest", 0.01, + {1.0, 4.0, 9.0, 16.0, 1.0, 4.0, 9.0, 16.0}, {1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0}, + {8}, {8}); } }; -class RsqrtTest : public MNNTestCase { +class RsqrtTest : public UnaryTestCommon { public: virtual ~RsqrtTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {1.0, 4.0, 9.0, 16.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Rsqrt(input); - const std::vector expectedOutput = {1.0, 1.0 / 2.0, 1.0 / 3.0, 1.0 / 4.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("RsqrtTest test failed!\n"); - return false; - } - return true; + return test(_Rsqrt, "RsqrtTest", 0.01, + {1.0, 4.0, 9.0, 16.0, 1.0, 4.0, 9.0, 16.0}, + {1.0, 1.0 / 2.0, 1.0 / 3.0, 1.0 / 4.0, 1.0, 1.0 / 2.0, 1.0 / 3.0, 1.0 / 4.0}, + {8}, {8}); } }; -class ExpTest : public MNNTestCase { +class ExpTest : public UnaryTestCommon { public: virtual ~ExpTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {1.0, 2.0, 3.0, 4.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Exp(input); - const std::vector expectedOutput = {2.718, 7.389, 20.086, 54.598}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("ExpTest test failed!\n"); - return false; - } - return true; + return test(_Exp, "ExpTest", 0.01, + {1.0, 2.0, 3.0, 4.0}, {2.718, 7.389, 20.086, 54.598}, + {4}, {4}); } }; -class LogTest : public MNNTestCase { +class LogTest : public UnaryTestCommon { public: virtual ~LogTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {2.718, 7.389, 20.086, 54.598}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Log(input); - const std::vector expectedOutput = {1.0, 2.0, 3.0, 4.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("LogTest test failed!\n"); - return false; - } - return true; + return test(_Log, "LogTest", 0.01, + {2.718, 7.389, 20.086, 54.598}, {1.0, 2.0, 3.0, 4.0}, + {4}, {4}); } }; -class SinTest : public MNNTestCase { +class SinTest : public UnaryTestCommon { public: virtual ~SinTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - 
const float inpudata[] = {0.0, 3.14 / 2.0, 3.14, 3.14 * 3.0 / 2.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Sin(input); - const std::vector expectedOutput = {0.0, 1.0, 0.0, -1.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("SinTest test failed!\n"); - return false; - } - return true; + return test(_Sin, "SinTest", 0.01, + {0.0, 3.14 / 2.0, 3.14, 3.14 * 3.0 / 2.0}, {0.0, 1.0, 0.0, -1.0}, + {4}, {4}); } }; -class CosTest : public MNNTestCase { +class CosTest : public UnaryTestCommon { public: virtual ~CosTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {0.0, 3.14 / 2.0, 3.14, 3.14 * 3.0 / 2.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Cos(input); - const std::vector expectedOutput = {1.0, 0.0, -1.0, 0.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("CosTest test failed!\n"); - return false; - } - return true; + return test(_Cos, "CosTest", 0.01, + {0.0, 3.14 / 2.0, 3.14, 3.14 * 3.0 / 2.0}, {1.0, 0.0, -1.0, 0.0}, + {4}, {4}); } }; -class TanTest : public MNNTestCase { +class TanTest : public UnaryTestCommon { public: virtual ~TanTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {100.0, 200.0, 300.0, 400.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Tan(input); - const std::vector expectedOutput = {-0.59, -1.79, 45.24, 1.62}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("TanTest test failed!\n"); - return false; - } - return true; + return test(_Tan, "TanTest", 0.01, + {100.0, 200.0, 300.0, 400.0}, {-0.59, -1.79, 45.24, 1.62}, + {4}, {4}); } }; -class AsinTest : public MNNTestCase { +class AsinTest : public UnaryTestCommon { public: virtual ~AsinTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.0, 0.0, 1.0, 0.707}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Asin(input); - const std::vector expectedOutput = {-3.14 / 2.0, 0.0, 3.14 / 2.0, 3.14 / 4.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("AsinTest test failed!\n"); - return false; - } - return true; + return test(_Asin, "AsinTest", 0.01, + {-1.0, 0.0, 1.0, 0.707}, {-3.14 / 2.0, 0.0, 3.14 / 2.0, 3.14 / 4.0}, + {4}, {4}); } }; -class AcosTest : public MNNTestCase { +class AcosTest : public UnaryTestCommon { public: virtual ~AcosTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.0, 0.0, 1.0, 0.707}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Acos(input); - const std::vector expectedOutput = {3.14, 1.57, 0.0, 3.14 / 4.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 
4, 0.01)) { - MNN_ERROR("AcosTest test failed!\n"); - return false; - } - return true; + return test(_Acos, "AcosTest", 0.01, + {-1.0, 0.0, 1.0, 0.707}, {3.14, 1.57, 0.0, 3.14 / 4.0}, + {4}, {4}); } }; -class AtanTest : public MNNTestCase { +class AtanTest : public UnaryTestCommon { public: virtual ~AtanTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-2.0, -1.0, 0.0, 1.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Atan(input); - const std::vector expectedOutput = {-1.11, -3.14 / 4.0, 0.0, 3.14 / 4.0}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("AtanTest test failed!\n"); - return false; - } - return true; + return test(_Atan, "AtanTest", 0.01, + {-2.0, -1.0, 0.0, 1.0}, {-1.11, -3.14 / 4.0, 0.0, 3.14 / 4.0}, + {4}, {4}); } }; -class ReciprocalTest : public MNNTestCase { +class ReciprocalTest : public UnaryTestCommon { public: virtual ~ReciprocalTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-2.0, -4.0, 2.0, 4.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Reciprocal(input); - const std::vector expectedOutput = {-0.5, -0.25, 0.50, 0.25}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("ReciprocalTest test failed!\n"); - return false; - } - return true; + return test(_Reciprocal, "ReciprocalTest", 0.01, + {-2.0, -4.0, 2.0, 4.0, -2.0, -4.0, 2.0, 4.0, 4.0}, {-0.5, -0.25, 0.50, 0.25, -0.5, -0.25, 0.50, 0.25, 0.25}, + {9}, {9}); } }; -class Log1PTest : public MNNTestCase { +class Log1PTest : public UnaryTestCommon { public: virtual ~Log1PTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {0.0, 1.0, 2.0, 3.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Log1p(input); - const std::vector expectedOutput = {0.0, 0.69, 1.10, 1.39}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("Log1PTest test failed!\n"); - return false; - } - return true; + return test(_Log1p, "Log1pTest", 0.01, + {0.0, 1.0, 2.0, 3.0}, {0.0, 0.69, 1.10, 1.39}, + {4}, {4}); } }; -class TanhTest : public MNNTestCase { +class TanhTest : public UnaryTestCommon { public: virtual ~TanhTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.0, 0.0, 1.0, 2.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Tanh(input); - const std::vector expectedOutput = {-0.76, 0.0, 0.76, 0.96}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("TanhTest test failed!\n"); - return false; - } - return true; + return test(_Tanh, "TanhTest", 0.01, + {-1.0f, 0.0f, 1.0f, 2.0f, -98.0f, 90.0f}, {-0.76f, 0.0f, 0.76f, 0.96f, -1.0f, 1.0f}, + {6}, {6}); } }; -class SigmoidTest : public MNNTestCase { +class SigmoidTest : public 
UnaryTestCommon { public: virtual ~SigmoidTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.0, 0.0, 1.0, 2.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Sigmoid(input); - const std::vector expectedOutput = {0.27, 0.50, 0.73, 0.88}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("SigmoidTest test failed!\n"); - return false; + int size = 32; + std::vector data_in(size), data_out(size); + for (int i = 0; i < size; ++i) { + data_in[i] = 0.25 * i - 4; + data_out[i] = 1 / (1 + expf(-data_in[i])); } - return true; + return test(_Sigmoid, "SigmoidTest", 0.01, + data_in, data_out, {size}, {size}); } }; -class AcoshTest : public MNNTestCase { +class AcoshTest : public UnaryTestCommon { public: virtual ~AcoshTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {1.0, 2.0, 3.0, 4.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Acosh(input); - const std::vector expectedOutput = {0., 1.3169579, 1.76274717, 2.06343707}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("AcoshTest test failed!\n"); - return false; - } - return true; + return test(_Acosh, "AcoshTest", 0.01, + {1.0, 2.0, 3.0, 4.0}, {0., 1.3169579, 1.76274717, 2.06343707}, + {4}, {4}); } }; -class AsinhTest : public MNNTestCase { +class AsinhTest : public UnaryTestCommon { public: virtual ~AsinhTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {1.0, 2.0, 3.0, 4.0}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Asinh(input); - const std::vector expectedOutput = {0.88137359, 1.44363548, 1.81844646, 2.09471255}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("AsinhTest test failed!\n"); - return false; - } - return true; + return test(_Asinh, "AsinhTest", 0.01, + {1.0, 2.0, 3.0, 4.0}, {0.88137359, 1.44363548, 1.81844646, 2.09471255}, + {4}, {4}); } }; -class AtanhTest : public MNNTestCase { +class AtanhTest : public UnaryTestCommon { public: virtual ~AtanhTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {0., 0.1, 0.2, 0.3}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Atanh(input); - const std::vector expectedOutput = {0., 0.10033535, 0.20273255, 0.3095196}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("AtanhTest test failed!\n"); - return false; - } - return true; + return test(_Atanh, "AtanhTest", 0.01, + {0., 0.1, 0.2, 0.3}, {0., 0.10033535, 0.20273255, 0.3095196}, + {4}, {4}); } }; -class RoundTest : public MNNTestCase { +class RoundTest : public UnaryTestCommon { public: virtual ~RoundTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input 
data - const float inpudata[] = {-1.2, -0.6, 0.4, 1.6}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Round(input); - const std::vector expectedOutput = {-1., -1., 0., 2.}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("RoundTest test failed!\n"); - return false; - } - return true; + return test(_Round, "RoundTest", 0.01, + {-1.2, -0.6, 0.4, 1.6}, {-1., -1., 0., 2.}, + {4}, {4}); } }; -class SignTest : public MNNTestCase { +class SignTest : public UnaryTestCommon { public: virtual ~SignTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.2, 0., 0.4, 1.6}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Sign(input); - const std::vector expectedOutput = {-1., 0., 1., 1.}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("SignTest test failed!\n"); - return false; - } - return true; + return test(_Sign, "SignTest", 0.01, + {-1.2, 0., 0.4, 1.6}, {-1., 0., 1., 1.}, + {4}, {4}); } }; -class CoshTest : public MNNTestCase { +class CoshTest : public UnaryTestCommon { public: virtual ~CoshTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.2, 0., 0.4, 1.6}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Cosh(input); - const std::vector expectedOutput = {1.81065557, 1., 1.08107237, 2.57746447}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("CoshTest test failed!\n"); - return false; - } - return true; + return test(_Cosh, "CoshTest", 0.01, + {-1.2, 0., 0.4, 1.6}, {1.81065557, 1., 1.08107237, 2.57746447}, + {4}, {4}); } }; -class ErfTest : public MNNTestCase { +class ErfTest : public UnaryTestCommon { public: virtual ~ErfTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.2, 0., 0.4, 1.6}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Erf(input); - const std::vector expectedOutput = {-0.91031396, 0., 0.42839235, 0.9763484}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("ErfTest test failed!\n"); - return false; - } - return true; + return test(_Erf, "ErfTest", 0.01, + {-1.2, 0., 0.4, 1.6}, {-0.91031396, 0., 0.42839235, 0.9763484}, + {4}, {4}); } }; -class ErfcTest : public MNNTestCase { +class ErfcTest : public UnaryTestCommon { public: virtual ~ErfcTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.2, 0., 0.4, 1.6}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Erfc(input); - const std::vector expectedOutput = {1.910314, 1., 0.57160765, 0.02365161}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("ErfcTest test failed!\n"); - return false; 
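Several of the expected vectors in these unary cases can be regenerated directly from the Python standard library, which is how one would refresh the ground truth if an input vector changes (a quick sketch, not part of the test code):

    import math

    xs = [-1.2, 0.0, 0.4, 1.6]
    print([round(math.erf(x), 6) for x in xs])   # [-0.910314, 0.0, 0.428392, 0.976348]
    print([round(math.erfc(x), 6) for x in xs])  # [1.910314, 1.0, 0.571608, 0.023652]

These agree with the ErfTest and ErfcTest expectations to well within the 0.01 tolerance used by UnaryTestCommon.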
- } - return true; + return test(_Erfc, "ErfcTest", 0.01, + {-1.2, 0., 0.4, 1.6}, {1.910314, 1., 0.57160765, 0.02365161}, + {4}, {4}); } }; -class ErfinvTest : public MNNTestCase { +class ErfinvTest : public UnaryTestCommon { public: virtual ~ErfinvTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {0, 0.4, 0.6, 0.9}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Erfinv(input); - const std::vector expectedOutput = {0., 0.37080714, 0.5951161, 1.1630871}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("ErfinvTest test failed!\n"); - return false; - } - return true; + return test(_Erfinv, "ErfinvTest", 0.01, + {0, 0.4, 0.6, 0.9}, {0., 0.37080714, 0.5951161, 1.1630871}, + {4}, {4}); } }; -class Expm1Test : public MNNTestCase { +class Expm1Test : public UnaryTestCommon { public: virtual ~Expm1Test() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.2, 0, 0.4, 1.6}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Expm1(input); - const std::vector expectedOutput = {-0.6988058, 0., 0.49182472, 3.9530325}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("Expm1Test test failed!\n"); - return false; - } - return true; + return test(_Expm1, "Expm1Test", 0.01, + {-1.2, 0, 0.4, 1.6}, {-0.6988058, 0., 0.49182472, 3.9530325}, + {4}, {4}); } }; -class SinhTest : public MNNTestCase { +class SinhTest : public UnaryTestCommon { public: virtual ~SinhTest() = default; virtual bool run() { - auto input = _Input( - { - 4, - }, - NCHW); - input->setName("input_tensor"); - // set input data - const float inpudata[] = {-1.2, 0, 0.4, 1.6}; - auto inputPtr = input->writeMap(); - memcpy(inputPtr, inpudata, 4 * sizeof(float)); - input->unMap(); - auto output = _Sinh(input); - const std::vector expectedOutput = {-1.5094614, 0., 0.41075233, 2.375568}; - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { - MNN_ERROR("SinhTest test failed!\n"); - return false; - } - return true; + return test(_Sinh, "SinhTest", 0.01, + {-1.2, 0, 0.4, 1.6}, {-1.5094614, 0., 0.41075233, 2.375568}, + {4}, {4}); } }; MNNTestSuiteRegister(AbsTest, "op/unary/abs"); diff --git a/test/speed/GridSampleSpeed.cpp b/test/speed/GridSampleSpeed.cpp new file mode 100644 index 00000000..96875b8a --- /dev/null +++ b/test/speed/GridSampleSpeed.cpp @@ -0,0 +1,80 @@ +// +// CropAndResizeTest.cpp +// MNNTests +// +// Created by MNN on 2021/03/11. 
+// Copyright © 2018, Alibaba Group Holding Limited +// +#include +#include +#include + +#include +#include + +#define MNN_OPEN_TIME_TRACE +#include + +#include "MNNTestSuite.h" +#include "TestUtils.h" + +using namespace MNN::Express; + +#define BATCH 8 +#define DEPTH 4 +#define WIDTH 720 +#define HEIGHT 720 +#define TIME 10 + +class GridSampleSpeed : public MNNTestCase { +public: + virtual ~GridSampleSpeed() = default; + + virtual bool run() { + const int batch = BATCH; + const int inHeight = HEIGHT; + const int inWidth = WIDTH; + const int outHeight = HEIGHT; + const int outWidth = WIDTH; + const int depth = DEPTH; + auto input = _Input({batch, depth, inHeight, inWidth}, NCHW); + auto grid = _Input({batch, outHeight, outWidth, 2}, NHWC); + + std::vector modes({BILINEAR}); + std::vector paddingModes({GRID_SAMPLE_PADDING_ZEROS}); + std::vector alignCornersVec({false}); + + std::vector expectedOutput(batch * outHeight * outWidth * depth); + for (auto mode : modes) { + std::string modeStr = mode == BILINEAR ? "bilinear" : "nearest"; + for (auto paddingMode : paddingModes) { + std::string paddingModeStr = paddingMode == GRID_SAMPLE_PADDING_ZEROS ? + "zeros" : (paddingMode == GRID_SAMPLE_PADDING_BORDER ? "border" + : "reflection"); + for (auto alignCorners : alignCornersVec) { + std::string alignCornersStr = alignCorners ? "true" : "false"; + +// grid->unMap(); +// input->unMap(); +// input = _Convert(input, NC4HW4); + auto output = _GridSample(input, grid, mode, paddingMode, alignCorners); + MNN_PRINT("Test GridSample for NCHW (%d, %d, %d, %d) x %d with setting %s %s %s \n", + BATCH, DEPTH, HEIGHT, WIDTH, TIME, + modeStr.c_str(), paddingModeStr.c_str(), alignCornersStr.c_str()); + { + AUTOTIME; + for (int i = 0; i < TIME; ++i) { + auto inputPtr = input->writeMap(); + auto gridPtr = grid->writeMap(); + + output->readMap(); + } + } + } + } + } + return true; + } +}; + +MNNTestSuiteRegister(GridSampleSpeed, "speed/GridSample"); diff --git a/tools/MNNPythonOfflineQuant/ReadMe.txt b/tools/MNNPythonOfflineQuant/ReadMe.txt new file mode 100644 index 00000000..f7f5c835 --- /dev/null +++ b/tools/MNNPythonOfflineQuant/ReadMe.txt @@ -0,0 +1,33 @@ +这是用MNN的python接口改造的离线量化工具,适用于如下情况: + 1. 你的模型无法使用MNN离线量化工具tools/quantization进行量化,例如多输入,数据预处理比较复杂 + 2. 你的模型无法使用MNN进行训练量化,受限于MNN的训练能力 + +为了使用这个工具,你需要提供: + 0. 使用 MNNConvert工具加上 --forTraining 将你的模型转换成MNN模型 + 1. 一个 calibration_dataset.py 文件,里面包含了你的校准数据集的定义 + 2. 一个 config.yaml 文件,里面包含了你模型的输入输出的相关信息 + +可以参考提供的 calibration_dataset.py 和 config.yaml 来实现 + +特别注意校准集中返回输入数据的顺序和config文件中输入的顺序应该是对应的 + +使用方法(batch size可以根据你的模型调整): + python mnn_offline_quant.py --mnn_model origin_float_model.mnn --quant_model quant_model.mnn --batch_size 32 + +############################################################################ + +This is a python version of MNN offline quant tool, use this tool when: + 1. you can not use MNN offline quant tool (tools/quantization) to quantize your model, cases like multi-input, complecated preprocessing + 2. you can not use MNN's quant-aware-training (QAT) tool to quantize your model, because of MNN's limited training features + +in order to use this tool, you need to provide: + 0. use --forTraining flag of MNNConvert to convert your model to MNN + 1. a calibration_dataset.py file, in which you define your calibration dataset + 2. a config.yaml file, in which you provide information of inputs and outputs of your model + +you can refer to the example file to write your own. 
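for example, a model whose config.yaml declares two inputs (the names here are only an illustration)

    inputs:
      names:
        ['image', 'seq_len']

must have its calibration dataset return the corresponding data in exactly that order, i.e. return [image_var, seq_len_var], [] from __getitem__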
+ +please Note, the order of returned input data in your calibration dataset should be aligned with the order of input your provide in your config.yaml file. + +usage of the tool (you can adjust batch size according to your own model): + python mnn_offline_quant.py --mnn_model origin_float_model.mnn --quant_model quant_model.mnn --batch_size 32 diff --git a/tools/MNNPythonOfflineQuant/calibration_dataset.py b/tools/MNNPythonOfflineQuant/calibration_dataset.py new file mode 100644 index 00000000..e1f291ef --- /dev/null +++ b/tools/MNNPythonOfflineQuant/calibration_dataset.py @@ -0,0 +1,106 @@ +import numpy as np +import os +from PIL import Image +import MNN +F = MNN.expr + + +# adapted from pycaffe +def load_image(filename, color=True): + """ + Load an image converting from grayscale or alpha as needed. + + Parameters + ---------- + filename : string + color : boolean + flag for color format. True (default) loads as RGB while False + loads as intensity (if image is already grayscale). + + Returns + ------- + image : an image with type np.float32 in range [0, 1] + of size (H x W x 3) in RGB or + of size (H x W x 1) in grayscale. + """ + img = Image.open(filename) + img = np.array(img) + if img.ndim == 2: + img = img[:, :, np.newaxis] + if color: + img = np.tile(img, (1, 1, 3)) + elif img.shape[2] == 4: + img = img[:, :, :3] + return img + + +def center_crop(image_data, crop_factor): + height, width, channels = image_data.shape + + h_size = int(height * crop_factor) + h_start = int((height - h_size) / 2) + h_end = h_start + h_size + + w_size = int(width * crop_factor) + w_start = int((width - w_size) / 2) + w_end = w_start + w_size + + cropped_image = image_data[h_start:h_end, w_start:w_end, :] + + return cropped_image + + +def resize_image(image, shape): + im = Image.fromarray(image) + im = im.resize(shape) + resized_image = np.array(im) + + return resized_image + + +class CalibrationDataset(MNN.data.Dataset): + ''' + This is demo for Imagenet calibration dataset. like pytorch, you need to overload __getiterm__ and __len__ methods + __getiterm__ should return a sample in F.const, and you should not use batch dimension here + __len__ should return the number of total samples in the calibration dataset + ''' + def __init__(self, image_folder): + super(CalibrationDataset, self).__init__() + self.image_folder = image_folder + self.image_list = os.listdir(image_folder)[0:1000] + + def __getitem__(self, index): + image_name = os.path.join(self.image_folder, self.image_list[index].split(' ')[0]) + + + # preprocess your data here, the following code are for tensorflow mobilenets + image_data = load_image(image_name) + image_data = center_crop(image_data, 0.875) + image_data = resize_image(image_data, (224, 224)) + image_data = (image_data - 127.5) / 127.5 + + + # after preprocessing the data, convert it to MNN data structure + dv = F.const(image_data.flatten().tolist(), [224, 224, 3], F.data_format.NHWC, F.dtype.float) + + ''' + first list for inputs, and may have many inputs, so it's a list + if your model have more than one inputs, add the preprocessed MNN const data to the input list + + second list for targets, also, there may be more than one targets + for calibration dataset, we don't need labels, so leave it blank + + Note that, the input order in the first list should be the same in your 'config.yaml' file. 
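        for a hypothetical second input, wrap the extra preprocessed array the same way and
        append it in config.yaml order (purely illustrative, the shape and name are made up):

            dv2 = F.const(extra_data.flatten().tolist(), [10], F.data_format.NHWC, F.dtype.float)
            return [dv, dv2], []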
+ ''' + + return [dv], [] + + def __len__(self): + # size of the dataset + return len(self.image_list) + + +''' +initialize a CalibrationDataset object, its name should be exactly 'calibration_dataset' +''' +calibration_dataset = CalibrationDataset(image_folder='/data/imagenet_train_images') diff --git a/tools/MNNPythonOfflineQuant/config.yaml b/tools/MNNPythonOfflineQuant/config.yaml new file mode 100644 index 00000000..77e8c986 --- /dev/null +++ b/tools/MNNPythonOfflineQuant/config.yaml @@ -0,0 +1,10 @@ +inputs: + names: + ['input', ] + shapes: + [[1, 3, 224, 224], ] + formats: + ['nchw', ] + +output_names: + ['MobilenetV2/Predictions/Reshape_1', ] diff --git a/tools/MNNPythonOfflineQuant/mnn_offline_quant.py b/tools/MNNPythonOfflineQuant/mnn_offline_quant.py new file mode 100644 index 00000000..91e5caab --- /dev/null +++ b/tools/MNNPythonOfflineQuant/mnn_offline_quant.py @@ -0,0 +1,118 @@ +from __future__ import print_function +import time +import argparse +import numpy as np +import tqdm +import MNN +import yaml +from calibration_dataset import calibration_dataset +from test_dataset import ImagenetDataset + +nn = MNN.nn +F = MNN.expr + + +def get_mnn_format(format_str): + fmt = str.lower(format_str) + if fmt == 'nchw': + return F.NCHW + elif fmt == 'nhwc': + return F.NHWC + elif fmt == 'nc4hw4': + return F.NC4HW4 + else: + raise ValueError("unknown format:", format_str) + +def quant_func(net, dataloader, opt): + net.train(True) + dataloader.reset() + + t0 = time.time() + for i in tqdm.trange(dataloader.iter_number): + example = dataloader.next() + input_data = example[0] + predicts = net.forward(input_data) + # fake update + opt.step(F.const(0.0, [])) + for predict in predicts: + predict.read() + + t1 = time.time() + cost = t1 - t0 + print("Epoch cost: %.3f s." % cost) + + +def main(): + ''' + offline quantization using MNN python api. + + 1. you need to convert your model to mnn model + + 2. you need to provide a calibration dataset by modifying preprocessing steps in + 'calibration_dataset.py' to suit your case. + + 3. you need to provide a config yaml file in which provide input and output information about your model. 
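    4. run the quantization, adjusting batch size to your model, for example:
       python mnn_offline_quant.py --mnn_model origin_float_model.mnn --quant_model quant_model.mnn --batch_size 32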
+ ''' + + parser = argparse.ArgumentParser() + parser.add_argument("--mnn_model", type=str, required=True,\ + help="original float MNN model file") + parser.add_argument("--quant_model", type=str, required=True, \ + help="name of quantized model to save") + parser.add_argument("--batch_size", type=int, required=False, default=32,\ + help="calibration batch size") + + args = parser.parse_args() + + mnn_model = args.mnn_model + quant_model = args.quant_model + batch_size = args.batch_size + + dataloader = MNN.data.DataLoader(calibration_dataset, batch_size=batch_size, shuffle=True) + + m = F.load_as_dict(mnn_model) + + inputs_outputs = F.get_inputs_and_outputs(m) + for key in inputs_outputs[0].keys(): + print('input names:\t', key) + for key in inputs_outputs[1].keys(): + print('output names:\t', key) + + config_file = "config.yaml" + f = open(config_file) + config = yaml.load(f) + + # get inputs and outputs + inputs = [] + for name in config['inputs']['names']: + inputs.append(m[name]) + + outputs = [] + for name in config['output_names']: + outputs.append(m[name]) + + input_placeholders = [] + for i in range(len(inputs)): + shape = config['inputs']['shapes'][i] + fmt = config['inputs']['formats'][i] + nnn_format = get_mnn_format(fmt) + input_placeholders.append(F.placeholder(shape, nnn_format)) + + net = nn.load_module(inputs, outputs, True) + + # no use optimizer + opt = MNN.optim.SGD(net, 0.01, 0.9, 0.0005) + + nn.compress.train_quant(net, quant_bits=8) + + quant_func(net, dataloader, opt) + + # save model + net.train(False) + predicts = net.forward(input_placeholders) + print("quantized model save to " + quant_model) + F.save(predicts, quant_model) + + +if __name__ == "__main__": + main() diff --git a/tools/converter/CMakeLists.txt b/tools/converter/CMakeLists.txt index 73e630f6..23f91a6e 100644 --- a/tools/converter/CMakeLists.txt +++ b/tools/converter/CMakeLists.txt @@ -25,13 +25,13 @@ IF(MNN_BUILD_CONVERTER) include_directories(${CMAKE_CURRENT_LIST_DIR}/include) include_directories(${CMAKE_CURRENT_LIST_DIR}/source/tflite/schema) include_directories(${CMAKE_CURRENT_BINARY_DIR}) + include(${CMAKE_CURRENT_LIST_DIR}/source/compression/CMakeLists.txt) include(${CMAKE_CURRENT_LIST_DIR}/source/tensorflow/CMakeLists.txt) include(${CMAKE_CURRENT_LIST_DIR}/source/onnx/CMakeLists.txt) include(${CMAKE_CURRENT_LIST_DIR}/source/caffe/CMakeLists.txt) include(${CMAKE_CURRENT_LIST_DIR}/source/MNN/CMakeLists.txt) include(${CMAKE_CURRENT_LIST_DIR}/source/optimizer/CMakeLists.txt) include(${CMAKE_CURRENT_LIST_DIR}/source/tflite/CMakeLists.txt) - include(${CMAKE_CURRENT_LIST_DIR}/source/compression/CMakeLists.txt) if(MNN_BUILD_TORCHSCRIPT) add_definitions(-DMNN_BUILD_TORCHSCRIPT) include(${CMAKE_CURRENT_LIST_DIR}/source/torchscript/CMakeLists.txt) diff --git a/tools/converter/include/addBizCode.hpp b/tools/converter/include/addBizCode.hpp index aa5c1e22..b2c11476 100644 --- a/tools/converter/include/addBizCode.hpp +++ b/tools/converter/include/addBizCode.hpp @@ -9,9 +9,8 @@ #ifndef ADDBIZCODE_HPP #define ADDBIZCODE_HPP -#include "options.hpp" #include "MNN_generated.h" int addBizCode(const std::string modelFile, const std::string bizCode, - const common::Options& options, std::unique_ptr& netT); + std::unique_ptr& netT); #endif // ADDBIZCODE_HPP diff --git a/tools/converter/include/caffeConverter.hpp b/tools/converter/include/caffeConverter.hpp index b83d1b66..68a69cbb 100644 --- a/tools/converter/include/caffeConverter.hpp +++ b/tools/converter/include/caffeConverter.hpp @@ -9,7 +9,6 @@ #ifndef 
CAFFECONVERTER_HPP #define CAFFECONVERTER_HPP -#include "options.hpp" #include "MNN_generated.h" /** @@ -17,10 +16,9 @@ * @param prototxtFile prototxt file name * @param modelFile caffemodel file name * @param bizCode(not used, always is MNN) - * @param options(converter common options) * @param MNN net */ int caffe2MNNNet(const std::string prototxtFile, const std::string modelFile, const std::string bizCode, - const common::Options& options, std::unique_ptr& netT); + std::unique_ptr& netT); #endif // CAFFECONVERTER_HPP diff --git a/tools/converter/include/config.hpp b/tools/converter/include/config.hpp index e0d885a3..024edd3a 100644 --- a/tools/converter/include/config.hpp +++ b/tools/converter/include/config.hpp @@ -61,6 +61,7 @@ public: std::string compressionParamsFile = ""; bool saveStaticModel = false; int optimizePrefer = 0; + float targetVersion = 1.2; }; #endif // CONFIG_HPP diff --git a/tools/converter/include/liteConverter.hpp b/tools/converter/include/liteConverter.hpp index 1f7857dd..ac2ad94a 100644 --- a/tools/converter/include/liteConverter.hpp +++ b/tools/converter/include/liteConverter.hpp @@ -16,7 +16,6 @@ #include "flatbuffers/minireflect.h" #include "flatbuffers/util.h" -#include "options.hpp" // MNN fbs header #include "MNN_generated.h" // tflite fbs header @@ -45,6 +44,6 @@ private: * @param MNN net */ int tflite2MNNNet(const std::string inputModel, const std::string bizCode, - const common::Options& options, std::unique_ptr& MNNNetT); + std::unique_ptr& MNNNetT); #endif // LITECONVERTER_HPP diff --git a/tools/converter/include/onnxConverter.hpp b/tools/converter/include/onnxConverter.hpp index 1f117d68..c297adc6 100644 --- a/tools/converter/include/onnxConverter.hpp +++ b/tools/converter/include/onnxConverter.hpp @@ -9,7 +9,6 @@ #ifndef ONNXCONVERTER_HPP #define ONNXCONVERTER_HPP -#include "options.hpp" #include "MNN_generated.h" /** @@ -19,6 +18,6 @@ * @param MNN net */ int onnx2MNNNet(const std::string inputModel, const std::string bizCode, - const common::Options& options, std::unique_ptr& netT); + std::unique_ptr& netT); #endif // ONNXCONVERTER_HPP diff --git a/tools/converter/include/tensorflowConverter.hpp b/tools/converter/include/tensorflowConverter.hpp index 6bc89290..2c24cd56 100644 --- a/tools/converter/include/tensorflowConverter.hpp +++ b/tools/converter/include/tensorflowConverter.hpp @@ -11,17 +11,14 @@ #include -#include "options.hpp" #include "MNN_generated.h" /** * @brief convert tensorflow model to MNN model * @param inputModel tensorflow model name(xx.pb) * @param bizCode(not used, always is MNN) - * @param options(converter common options) * @param MNN net */ int tensorflow2MNNNet(const std::string inputModel, const std::string bizCode, - const common::Options& options, std::unique_ptr& netT); #endif // TENSORFLOWCONVERTER_HPP diff --git a/tools/converter/include/torchscriptConverter.hpp b/tools/converter/include/torchscriptConverter.hpp index 56bc29da..ee6444bb 100644 --- a/tools/converter/include/torchscriptConverter.hpp +++ b/tools/converter/include/torchscriptConverter.hpp @@ -9,7 +9,6 @@ #ifndef TORCHSCRIPTCONVERTER_HPP #define TORCHSCRIPTCONVERTER_HPP -#include "options.hpp" #include "MNN_generated.h" /** @@ -19,6 +18,6 @@ * @param MNN net */ int torchscript2MNNNet(const std::string inputModel, const std::string bizCode, - const common::Options& options, std::unique_ptr& netT); + std::unique_ptr& netT); #endif // TORCHSCRIPTCONVERTER_HPP diff --git a/tools/converter/source/MNN/addBizCode.cpp b/tools/converter/source/MNN/addBizCode.cpp 
index 4f44dae4..d7ba2fdf 100644 --- a/tools/converter/source/MNN/addBizCode.cpp +++ b/tools/converter/source/MNN/addBizCode.cpp @@ -10,10 +10,9 @@ #include #include #include "logkit.h" -#include "options.hpp" int addBizCode(const std::string modelFile, const std::string bizCode, - const common::Options& options, std::unique_ptr& netT) { + std::unique_ptr& netT) { std::ifstream inputFile(modelFile, std::ios::binary); inputFile.seekg(0, std::ios::end); auto size = inputFile.tellg(); diff --git a/tools/converter/source/MNNConverter.cpp b/tools/converter/source/MNNConverter.cpp index bd3aab8c..1a1e636c 100644 --- a/tools/converter/source/MNNConverter.cpp +++ b/tools/converter/source/MNNConverter.cpp @@ -17,7 +17,6 @@ #include "tensorflowConverter.hpp" #include "torchscriptConverter.hpp" #include "writeFb.hpp" -#include "options.hpp" #include "common/Global.hpp" int main(int argc, char *argv[]) { @@ -29,23 +28,22 @@ int main(int argc, char *argv[]) { Cli::printProjectBanner(); Global::Reset(&modelPath); - auto options = common::BuildOptions(modelPath.compressionParamsFile); std::cout << "Start to Convert Other Model Format To MNN Model..." << std::endl; std::unique_ptr netT = std::unique_ptr(new MNN::NetT()); if (modelPath.model == modelConfig::CAFFE) { - caffe2MNNNet(modelPath.prototxtFile, modelPath.modelFile, modelPath.bizCode, options, netT); + caffe2MNNNet(modelPath.prototxtFile, modelPath.modelFile, modelPath.bizCode, netT); } else if (modelPath.model == modelConfig::TENSORFLOW) { - tensorflow2MNNNet(modelPath.modelFile, modelPath.bizCode, options, netT); + tensorflow2MNNNet(modelPath.modelFile, modelPath.bizCode, netT); } else if (modelPath.model == modelConfig::MNN) { - addBizCode(modelPath.modelFile, modelPath.bizCode, options, netT); + addBizCode(modelPath.modelFile, modelPath.bizCode, netT); } else if (modelPath.model == modelConfig::ONNX) { - onnx2MNNNet(modelPath.modelFile, modelPath.bizCode, options, netT); + onnx2MNNNet(modelPath.modelFile, modelPath.bizCode, netT); } else if (modelPath.model == modelConfig::TFLITE) { - tflite2MNNNet(modelPath.modelFile, modelPath.bizCode, options, netT); + tflite2MNNNet(modelPath.modelFile, modelPath.bizCode, netT); #ifdef MNN_BUILD_TORCHSCRIPT } else if (modelPath.model == modelConfig::TORCHSCRIPT) { - torchscript2MNNNet(modelPath.modelFile, modelPath.bizCode, options, netT); + torchscript2MNNNet(modelPath.modelFile, modelPath.bizCode, netT); #endif } else { std::cout << "Not Support Model Type" << std::endl; diff --git a/tools/converter/source/TestConvertResult.cpp b/tools/converter/source/TestConvertResult.cpp index 9d6129d0..6a66307f 100644 --- a/tools/converter/source/TestConvertResult.cpp +++ b/tools/converter/source/TestConvertResult.cpp @@ -16,7 +16,6 @@ #include #include "PostConverter.hpp" #include "rapidjson/document.h" -#include "options.hpp" #include #include #include @@ -70,7 +69,7 @@ static bool compareOutput(VARP output, const std::string& directName, const std: } int main(int argc, char *argv[]) { if (argc < 3) { - MNN_ERROR("Usage: ./TestConvertResult [Onnx, Tf] ${Dir}\n"); + MNN_ERROR("Usage: ./TestConvertResult [Onnx, Tf, Tflite] ${Dir}\n"); return 0; } std::string inputType = argv[1]; @@ -84,6 +83,11 @@ int main(int argc, char *argv[]) { converter = tensorflow2MNNNet; suffix = ".pb"; dataFormat = NHWC; + } else if (inputType == "Tflite") { + inputModel = modelConfig::TFLITE; + converter = tflite2MNNNet; + suffix = ".tflite"; + dataFormat = NHWC; } MNN_PRINT("Test %s\n", directName.c_str()); std::string defaultCacheFile = 
".___temp.mnn"; @@ -92,11 +96,10 @@ int main(int argc, char *argv[]) { modelPath.model = inputModel; Global::Reset(&modelPath); - auto options = common::DefaultOptions(); std::ostringstream modelNameOs; modelNameOs << directName << "/test" << suffix; std::unique_ptr netT = std::unique_ptr(new MNN::NetT()); - converter(modelNameOs.str().c_str(), "Test", options, netT); + converter(modelNameOs.str().c_str(), "Test", netT); std::unique_ptr newNet = optimizeNet(netT, false); flatbuffers::FlatBufferBuilder builderOutput(1024); builderOutput.ForceDefaults(true); diff --git a/tools/converter/source/caffe/Convolution.cpp b/tools/converter/source/caffe/Convolution.cpp index e7860c2a..87692833 100644 --- a/tools/converter/source/caffe/Convolution.cpp +++ b/tools/converter/source/caffe/Convolution.cpp @@ -219,3 +219,4 @@ public: }; static OpConverterRegister ab("ConvolutionDepthwise"); +static OpConverterRegister ab2("DepthwiseConv"); diff --git a/tools/converter/source/caffe/ResizeInterp.cpp b/tools/converter/source/caffe/ResizeInterp.cpp index 4a5fe8c7..14da080c 100644 --- a/tools/converter/source/caffe/ResizeInterp.cpp +++ b/tools/converter/source/caffe/ResizeInterp.cpp @@ -8,6 +8,30 @@ #include "OpConverter.hpp" +class Upsample : public OpConverter { +public: + virtual void run(MNN::OpT* dstOp, const caffe::LayerParameter& parameters, const caffe::LayerParameter& weight); + Upsample() { + } + virtual ~Upsample() { + } + virtual MNN::OpType opType() { + return MNN::OpType_Resize; + } + virtual MNN::OpParameter type() { + return MNN::OpParameter_Resize; + } +}; + +void Upsample::run(MNN::OpT* dstOp, const caffe::LayerParameter& parameters, const caffe::LayerParameter& weight) { + auto resize = new MNN::ResizeT; + dstOp->main.value = resize; + auto& r = parameters.upsample_param(); + resize->xScale = r.scale(); + resize->yScale = r.scale(); +} +static OpConverterRegister ___a("Upsample"); + class Resize : public OpConverter { public: virtual void run(MNN::OpT* dstOp, const caffe::LayerParameter& parameters, const caffe::LayerParameter& weight); diff --git a/tools/converter/source/caffe/caffe.proto b/tools/converter/source/caffe/caffe.proto index eb93bf1d..9a894910 100644 --- a/tools/converter/source/caffe/caffe.proto +++ b/tools/converter/source/caffe/caffe.proto @@ -568,6 +568,7 @@ message LayerParameter { optional InterpParameter interp_param = 2210; optional ROIPoolingParameter roi_pooling_param = 2201; optional ClipParameter clip_param = 2202; + optional UpsampleParameter upsample_param = 2203; } // Message that stores parameters used by ClipLayer @@ -2093,3 +2094,7 @@ message ShuffleChannelParameter { // for Mobile Devices" optional uint32 group = 1[default = 1]; // The number of group } + +message UpsampleParameter { + optional float scale = 1; +} diff --git a/tools/converter/source/caffe/caffeConverter.cpp b/tools/converter/source/caffe/caffeConverter.cpp index 81d2fba7..688ff9a3 100644 --- a/tools/converter/source/caffe/caffeConverter.cpp +++ b/tools/converter/source/caffe/caffeConverter.cpp @@ -18,10 +18,9 @@ #include "CaffeUtils.hpp" #include "caffeConverter.hpp" -#include "options.hpp" int caffe2MNNNet(const std::string prototxtFile, const std::string modelFile, const std::string bizCode, - const common::Options& options, std::unique_ptr& netT) { + std::unique_ptr& netT) { caffe::NetParameter caffeProtxt; caffe::NetParameter caffeModel; bool succ = read_proto_from_text(prototxtFile.c_str(), &caffeProtxt); diff --git a/tools/converter/source/cli.cpp b/tools/converter/source/cli.cpp index 
0df78fd5..f5f9ebc1 100644 --- a/tools/converter/source/cli.cpp +++ b/tools/converter/source/cli.cpp @@ -16,6 +16,7 @@ #endif #include "config.hpp" #include "logkit.h" +#include /** * Print Command Line Banner @@ -59,6 +60,7 @@ cxxopts::Options Cli::initializeMNNConvertArgs(modelConfig &modelPath, int argc, "weight scales and zero points for quantization or information " "for sparsity.", cxxopts::value())( "saveStaticModel", "save static model with fix shape, default: false", cxxopts::value())( + "targetVersion", "compability for old mnn engine, default: 1.2f", cxxopts::value())( "inputConfigFile", "set input config file for static model, ex: ~/config.txt", cxxopts::value()); auto result = options.parse(argc, argv); @@ -140,7 +142,11 @@ cxxopts::Options Cli::initializeMNNConvertArgs(modelConfig &modelPath, int argc, std::cout << options.help({""}) << std::endl; exit(EXIT_FAILURE); } - + if (result.count("targetVersion")) { + auto version = result["targetVersion"].as(); + std::cout << "TargetVersion is " << version << std::endl; + modelPath.targetVersion = version; + } // add MNN bizCode if (result.count("bizCode")) { const std::string bizCode = result["bizCode"].as(); diff --git a/tools/converter/source/common/writeFb.cpp b/tools/converter/source/common/writeFb.cpp index df0cbfea..a56e4b74 100644 --- a/tools/converter/source/common/writeFb.cpp +++ b/tools/converter/source/common/writeFb.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include "MNN_generated.h" #include "half.hpp" @@ -19,8 +20,12 @@ #include #include "cli.hpp" #include "../../common/Global.hpp" +#include "MNN_compression.pb.h" +#include "MNN/expr/ExprCreator.hpp" +#include "cpp/IDSTEncoder.hpp" using namespace MNN; +using namespace MNN::Express; using namespace std; static float findAbsMax(const float *weights, const int count) { @@ -52,367 +57,6 @@ static std::vector findMinMax(const float *weights, const int count) { return {min, max}; } -static void WriteBlobDim(ostream &out, std::vector dims) -{ - char tmp[4]; - ((unsigned char *)tmp)[0] = (unsigned char)dims.size(); - out.write(tmp, 1); - for (int i = 0; i < dims.size(); i++) - { - unsigned short tmpShort = (unsigned short)dims[i]; - out.write((const char*)(&tmpShort), 2); - } -} - -static void FillBuffer(char *buf, unsigned int buf_len, const char *arr, unsigned int arr_len, unsigned char iNeedBits) -{ - memset(buf, 0, buf_len); - char *tmp = buf; - int iOffset = 0; - unsigned char cMask = (1 << iNeedBits) - 1; - for (int i = 0; i < arr_len; i++) - { - char value = arr[i]; - int uShift = 8 - iNeedBits - iOffset % 8; - if (uShift < 0) - { - tmp[iOffset / 8] |= ((value & cMask) >> (0 - uShift)); - tmp[(iOffset / 8) + 1] |= ((value & cMask) << (8 + uShift)); - } - else - { - tmp[iOffset / 8] |= ((value & cMask) << uShift); - } - iOffset += iNeedBits; - if (iOffset % 8 == 0) - { - tmp += iOffset / 8; - iOffset = 0; - } - } -} - -static void GetWeightSet(set &setWeight, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag) -{ - setWeight.clear(); - if (asymmetricQuantFlag) { - for (int i = 0; i < channel; i++) - { - float min = alphaData[2*i]; - float alpha = alphaData[2*i+1]; - if (alpha <= 1e-6f) - { - setWeight.insert(-128); - continue; - } - for (int j = 0; j < area; j++) - { - float weight = weightData[i * area + j]; - setWeight.insert(round((weight - min) / alpha) + (-128)); - } - } - } else { - for (int i = 0; i < channel; i++) - { - float alpha = alphaData[i]; - if (alpha <= 1e-6f) - { - setWeight.insert(0); - 
continue; - } - for (int j = 0; j < area; j++) - { - float weight = weightData[i * area + j]; - setWeight.insert(round(weight / alpha)); - } - } - } -} - -static float GetSparsity(const float* weightData, int weightSize, unsigned int& nnz, const float* alphaData, int area, int channel, bool asymmetricQuantFlag, int iMaxStep = -1) -{ - nnz = 0; - int iPreIdx = 0; - float sparsity; - if (asymmetricQuantFlag) { - for (int i = 0; i < weightSize; i++) - { - float min = alphaData[2*(i/area)]; - float alpha = alphaData[2*(i/area)+1]; - int zeroQuant = -128; - if (alpha > 1e-6) { - zeroQuant = round((0.0f - min) / alpha) + (-128); - } - - float weight = weightData[i]; - int value = -128; - if (alpha > 1e-6) - { - value = round((weight - min) / alpha) + (-128); - } - - if (value != zeroQuant) - { - nnz++; - iPreIdx = i; - } - if ((i - iPreIdx >= iMaxStep) && (iMaxStep != -1)) - { - nnz++; - iPreIdx = i; - } - } - } else { - for (int i = 0; i < weightSize; i++) - { - float alpha = alphaData[i / area]; - float weight = weightData[i]; - int value = 0; - if (alpha > 1e-6f) - { - value = round(weight / alpha); - } - - if (value != 0) - { - nnz++; - iPreIdx = i; - } - if ((i - iPreIdx >= iMaxStep) && (iMaxStep != -1)) - { - nnz++; - iPreIdx = i; - } - } - } - sparsity = 1 - 1.0f * nnz / weightSize; - return sparsity; -} - -unsigned int GetBestMaxStep(const float* weightData, int weightSize, unsigned char& iMaxStepBits, int BlobDataSize, const float* alphaData, int area, int channel, bool asymmetricQuantFlag) -{ - size_t szBestSize = 1000000000; - unsigned int best_nnz = 0; - for (int i = 2; i < 9; i++) - { - unsigned int nnz = 0; - GetSparsity(weightData, weightSize, nnz, alphaData, area, channel, asymmetricQuantFlag, pow(2, i) - 1); - size_t tmp = ceil(0.125 * nnz * i) + ceil(0.125 * nnz * BlobDataSize); - if (tmp < szBestSize) - { - iMaxStepBits = (unsigned char) i; - szBestSize = tmp; - best_nnz = nnz; - } - } - return best_nnz; -} - -static void WriteCQBlobs(ostream &out, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag) -{ - //push values into buffer - //Find int values in all blobs and check; - set setWeight; - GetWeightSet(setWeight, weightData, alphaData, area, channel, asymmetricQuantFlag); - int iCount = setWeight.size(); - int iNeedBits = ceil(log2(iCount)); - if (iNeedBits > 8) { - MNN_ERROR("The Bits need large than 8, the model may be error for user\n"); - return; - } - map mapWeight; - int iIdx = 0; - for (set::iterator it = setWeight.begin(); it != setWeight.end(); it++) - { - mapWeight[*it] = iIdx++; - } - size_t buf_len = size_t(ceil(0.125 * iNeedBits * area * channel)); - char *buf = new char[buf_len]; - { - char *arr = new char[area * channel]; - char *tmp = arr; - if (asymmetricQuantFlag) { - for (int i = 0; i < channel; i++) - { - float min = alphaData[2*i]; - float alpha = alphaData[2*i+1]; - for (int j = 0; j < area; j++) - { - float weight = weightData[i * area + j]; - int value = -128; - if (alpha > 1e-6f) - { - value = round((weight - min) / alpha) + (-128); - } - *tmp = mapWeight[value]; - tmp++; - } - } - } else { - for (int i = 0; i < channel; i++) - { - float alpha = alphaData[i]; - for (int j = 0; j < area; j++) - { - float weight = weightData[i * area + j]; - int value = 0; - if (alpha > 1e-6f) - { - value = round(weight / alpha); - } - *tmp = mapWeight[value]; - tmp++; - } - } - } - FillBuffer(buf, buf_len, arr, area * channel, iNeedBits); - delete[] arr; - } - //begin write to file - { - char tmp[100]; - //1. 
weights blob shape(unsigned int32) - WriteBlobDim(out, {channel, area}); - // 2. Avalable values Count(unsigned char) - tmp[0] = (unsigned char)iCount; - out.write(tmp, 1); - // 3. valueset(signed char * valueset_size) - for (set::iterator it = setWeight.begin(); it != setWeight.end(); it++) - { - tmp[0] = (unsigned char)*it; - out.write(tmp, 1); - } - // 4. weights indexes(size = ceil(0.125*weights_count*ceil(log2(Avalable_values_Count)))) - out.write(buf, buf_len); - //g_totalSize += 1 + setWeight.size() + buf_len; - } - delete[] buf; -} - -static void WriteSparseQuanBlobs(ostream &out, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag) -{ - set setWeight; - GetWeightSet(setWeight, weightData, alphaData, area, channel, asymmetricQuantFlag); - int iDataNeedBits = ceil(log2(setWeight.size())); - unsigned int nnz = 0; - int weightSize = area * channel; - map mapWeight; - { - int iIdx = 0; - for (set::iterator it = setWeight.begin(); it != setWeight.end(); it++) - { - mapWeight[*it] = iIdx++; - } - } - unsigned char iNeedBits; - nnz = GetBestMaxStep(weightData, weightSize, iNeedBits, iDataNeedBits, alphaData, area, channel, asymmetricQuantFlag); - //weight buf - size_t data_buf_len = size_t(ceil(0.125 * iDataNeedBits * nnz)); - char* data_buf = new char[data_buf_len]; - //sparse COO buf - size_t buf_len = size_t(ceil(0.125 * iNeedBits * nnz)); - char* buf = new char[buf_len]; - { //fill buf with step values; - unsigned char* arr_idx = new unsigned char[nnz]; - unsigned char* data_arr = new unsigned char[nnz]; - unsigned char* tmp = arr_idx; - int iMaxStep = pow(2, iNeedBits) - 1; - int iPreIdx = 0; - unsigned char* dTmp = data_arr; - if (asymmetricQuantFlag) { - for (int i = 0; i < weightSize; i++) - { - float min = alphaData[2*(i/area)]; - float alpha = alphaData[2*(i/area)+1]; - int zeroQuant = -128; - if (alpha > 1e-6) { - zeroQuant = round((0.0f - min) / alpha) + (-128); - } - - float weight = weightData[i]; - int value = -128; - if (alpha > 1e-6) - { - value = round((weight - min) / alpha) + (-128); - } - - if (value != zeroQuant) - { - *dTmp = mapWeight[value]; - *tmp = i - iPreIdx; - iPreIdx = i; - tmp++; - dTmp++; - } - if (i - iPreIdx >= iMaxStep) - { - *dTmp = mapWeight[zeroQuant]; - *tmp = i - iPreIdx; - iPreIdx = i; - tmp++; - dTmp++; - } - } - } else { - for (int i = 0; i < weightSize; i++) - { - float alpha = alphaData[i / area]; - float weight = weightData[i]; - int value = 0; - if (alpha > 1e-6f) - { - value = round(weight / alpha); - } - - if (value != 0) - { - *dTmp = mapWeight[value]; - *tmp = i - iPreIdx; - iPreIdx = i; - tmp++; - dTmp++; - } - if (i - iPreIdx >= iMaxStep) - { - *dTmp = mapWeight[0]; - *tmp = i - iPreIdx; - iPreIdx = i; - tmp++; - dTmp++; - } - } - } - FillBuffer(buf, buf_len, (char*) arr_idx, nnz, iNeedBits); - FillBuffer(data_buf, data_buf_len, (char*) data_arr, nnz, iDataNeedBits); - delete[] arr_idx; - delete[] data_arr; - } - { //write - char tmp[100]; - // 1.weights blob shape(unsigned int32) - WriteBlobDim(out, {channel, area}); - // 2. nnz - out.write((const char*) &nnz, 4); - // 3. max_step use # bits () (unsigned char) - out.write((const char*) &iNeedBits, 1); - // 4. buf for steps ceil(nnz*step need bits/8) - out.write(buf, buf_len); - // 5. Avalable values Count(unsigned char) - tmp[0] = (unsigned char) setWeight.size(); - out.write(tmp, 1); - // 6. 
valueset(signed char * valueset_size) - for (set::iterator it = setWeight.begin(); it != setWeight.end(); it++) - { - tmp[0] = (unsigned char) *it; - out.write(tmp, 1); - } - // 7. none zero weights indexes(nnz*ceil(log2(Avalable_values_Count))/8) - out.write((const char*) data_buf, data_buf_len); - } - delete[] buf; - delete[] data_buf; -} - int writeFb(std::unique_ptr& netT, const std::string& MNNModelFile, modelConfig config) { auto RemoveParams = [](std::unique_ptr& op) { const auto opType = op->type; @@ -522,6 +166,161 @@ int writeFb(std::unique_ptr& netT, const std::string& MNNModelFile, m } } + auto FullQuantAndCoding = [&](std::unique_ptr& op, Compression::Pipeline& proto, SubGraphProtoT* subgraph) { + std::string outputTensorName = subgraph ? subgraph->tensors[op->outputIndexes[0]] : netT->tensorName[op->outputIndexes[0]];; + auto opType = op->type; + if (opType != MNN::OpType_Convolution && opType != MNN::OpType_ConvolutionDepthwise) { + return; + } + + auto findQuantParameters = [&](Compression::Pipeline& proto, std::string outputTensorName) { + for (const auto& algo : proto.algo()) { + if (algo.type() == Compression::CompressionAlgo::QUANTIZE) { + auto quantParams = algo.quant_params(); + for (const auto& layerProto : quantParams.layer()) { + const std::string& outputName = layerProto.output(0).name(); + if (outputName == outputTensorName) { + return layerProto; + } + } + } + } + MNN::Compression::LayerQuantizeParams empty; + return empty; + }; + + auto inputIndex = op->inputIndexes[0]; + int outputIndex = op->outputIndexes[0]; + auto quantParams = findQuantParameters(proto, outputTensorName); + if (quantParams.weight_size() == 0) { + return; + } + + auto inputParams = quantParams.input(0); + auto outputParams = quantParams.output(0); + auto weightParams = quantParams.weight(0); + auto& tensorDescribe = subgraph ? 
subgraph->extraTensorDescribe : netT->extraTensorDescribe; + + std::unique_ptr inDescribe(new MNN::TensorDescribeT); + inDescribe->index = inputIndex; + std::unique_ptr inputQuantInfo(new MNN::TensorQuantInfoT); + inputQuantInfo->zero = inputParams.zero_point(); + inputQuantInfo->scale = inputParams.scales(0); + inputQuantInfo->min = inputParams.clamp_min(); + inputQuantInfo->max = inputParams.clamp_max(); + inputQuantInfo->type = MNN::DataType_DT_INT8; + inDescribe->quantInfo = std::move(inputQuantInfo); + tensorDescribe.emplace_back(std::move(inDescribe)); + + std::unique_ptr outDescribe(new MNN::TensorDescribeT); + outDescribe->index = outputIndex; + std::unique_ptr outputQuantInfo(new MNN::TensorQuantInfoT); + outputQuantInfo->zero = outputParams.zero_point(); + // outputQuantInfo->scale = 1.f / outputParams.scales(0); + outputQuantInfo->scale = outputParams.scales(0); + outputQuantInfo->min = outputParams.clamp_min(); + outputQuantInfo->max = outputParams.clamp_max(); + outputQuantInfo->type = MNN::DataType_DT_INT8; + outDescribe->quantInfo = std::move(outputQuantInfo); + tensorDescribe.emplace_back(std::move(outDescribe)); + + + auto convParams = op->main.AsConvolution2D(); + auto weightFloat = convParams->weight; + auto biasFloat = convParams->bias; + auto& common = convParams->common; + + const int ko = common->outputCount; + const int ki = common->inputCount / common->group; + const int kh = common->kernelY; + const int kw = common->kernelX; + const int kernelNum = common->outputCount; + const int kernelSize = weightFloat.size() / kernelNum; + + VARP weightVar = _Const(weightFloat.data(), {ko, ki, kh, kw}, NCHW); + VARP biasVar = _Const(biasFloat.data(), {ko, 1, 1, 1}, NCHW); + VARP inputScaleVar = _Const(inputParams.scales(0), {}, NCHW); + VARP outputScaleVar = _Const(outputParams.scales(0), {}, NCHW); + + float wClampMin = weightParams.clamp_min(); + float wClampMax = weightParams.clamp_max(); + + std::vector weightScaleVector(weightParams.scales().begin(), weightParams.scales().end()); + VARP weightScale = _Const(weightScaleVector.data(), {(int)weightScaleVector.size(), 1, 1, 1}, NCHW, halide_type_of()); + auto quanWeightTemp = _Round(weightVar * _Reciprocal(weightScale)); + auto quanWeightClamp = MNN::Express::_Maximum(_Minimum(quanWeightTemp, _Scalar(wClampMax)), _Scalar(wClampMin)); + auto quanWeight = _Cast(quanWeightClamp); + auto convScale = _Reshape(_Reciprocal(outputScaleVar), {-1, 1, 1, 1}) * weightScale * inputScaleVar; + + auto remains = _ReduceSum(_Scalar(inputParams.zero_point()) * _Cast(quanWeight), {1, 2, 3}, true); + auto outputZeroPointFused = _Cast(_Scalar(outputParams.zero_point()) * _Reciprocal(convScale)); + auto quanBias = _Cast(biasVar * _Reciprocal(weightScale * inputScaleVar)) - remains + outputZeroPointFused; + auto deQuantBias = _Cast(quanBias) * (weightScale * inputScaleVar); + + std::vector quantWeightFloat; + std::vector quantWeights; + std::vector biasData; + std::vector scale; + + { + auto info = quanWeight->getInfo(); + quantWeights.resize(info->size); + quantWeightFloat.resize(info->size); + auto ptr = quanWeight->readMap(); + for (int i = 0; i < quantWeightFloat.size(); i++) { + quantWeightFloat[i] = ptr[i]; + quantWeights[i] = ptr[i]; + } + } + { + auto biasinfo = deQuantBias->getInfo(); + biasData.resize(biasinfo->size); + auto ptr = deQuantBias->readMap(); + ::memcpy(biasData.data(), ptr, biasData.size() * sizeof(int32_t)); + + auto info = weightScale->getInfo(); + scale.resize(info->size); + MNN_ASSERT(scale.size() == biasData.size()); + 
auto ptrScale = weightScale->readMap(); + ::memcpy(scale.data(), ptrScale, scale.size() * sizeof(float)); + } + + bool asymmetricQuantFlag = false; + std::vector fakeScales(kernelNum, 1.0f); + convParams->quanParameter = IDSTEncoder::encode(quantWeightFloat, fakeScales, kernelSize, kernelNum, asymmetricQuantFlag, quantWeights.data(), wClampMin); + convParams->weight.clear(); + convParams->quanParameter->alpha = std::move(scale); + convParams->quanParameter->scaleIn = inputParams.scales(0); + convParams->quanParameter->scaleOut = outputParams.scales(0); + + convParams->symmetricQuan.reset(new MNN::QuantizedFloatParamT); + convParams->symmetricQuan->method = MNN::QuantizeAlgo(int(quantParams.method())); + convParams->symmetricQuan->nbits = outputParams.bits(); + + convParams->bias = std::move(biasData); + }; + + { + auto gConverterConfig = Global::Get(); + std::string compressFileName = gConverterConfig->compressionParamsFile; + if (compressFileName != "") { + Compression::Pipeline proto; + std::fstream input(compressFileName.c_str(), std::ios::in | std::ios::binary); + if (!proto.ParseFromIstream(&input)) { + MNN_ERROR("Failed to parse compression pipeline proto.\n"); + } + + for (auto& op : netT->oplists) { + FullQuantAndCoding(op, proto, nullptr); + } + for (auto& subgraph : netT->subgraphs) { + for (auto& op : subgraph->nodes) { + FullQuantAndCoding(op, proto, subgraph.get()); + } + } + } + } + auto WeightQuantAndCoding = [&](std::unique_ptr& op) { const auto opType = op->type; // config.weightQuantBits only control weight quantization for float convolution @@ -562,14 +361,19 @@ int writeFb(std::unique_ptr& netT, const std::string& MNNModelFile, m auto gConverterConfig = Global::Get(); bool asymmetricQuantFlag = gConverterConfig->weightQuantAsymmetric; + float threshold = (float)(1 << (bits - 1)) - 1.0f; + float clampMin = -threshold; + if (asymmetricQuantFlag) { + clampMin = -threshold - 1; + } std::vector weightData, scales; + std::vector quantWeights; switch (opType) { case MNN::OpType_Convolution: case MNN::OpType_ConvolutionDepthwise: case MNN::OpType_Deconvolution: case MNN::OpType_DeconvolutionDepthwise: { - float thredhold = (float)(1 << (bits - 1)) - 1.0f; weightData = param->weight; if (asymmetricQuantFlag) { @@ -579,10 +383,16 @@ int writeFb(std::unique_ptr& netT, const std::string& MNNModelFile, m auto minAndMax = findMinMax(weightData.data() + beginIndex, kernelSize); float min = minAndMax[0]; float max = minAndMax[1]; - float scale = (max - min) / (127 + 128); + float scale = (max - min) / (threshold - clampMin); scales[2*k] = min; scales[2*k+1] = scale; + + for (int ii = 0; ii < kernelSize; ii++) { + float* ptr = weightData.data() + beginIndex; + int8_t quantValue = int8_t(std::round((ptr[ii] - min) / scale + clampMin)); + quantWeights.emplace_back(quantValue); + } } } else { scales.resize(kernelNum); @@ -590,7 +400,13 @@ int writeFb(std::unique_ptr& netT, const std::string& MNNModelFile, m int beginIndex = k * kernelSize; auto absMax = findAbsMax(weightData.data() + beginIndex, kernelSize); - scales[k] = absMax / thredhold; + scales[k] = absMax / threshold; + + for (int ii = 0; ii < kernelSize; ii++) { + float* ptr = weightData.data() + beginIndex; + int8_t quantValue = int8_t(std::round(ptr[ii] / scales[k])); + quantWeights.emplace_back(quantValue); + } } } @@ -602,11 +418,7 @@ int writeFb(std::unique_ptr& netT, const std::string& MNNModelFile, m for (int i = 0; i < int8Params->weight.size(); i++) { weightData.emplace_back(float(int8Params->weight[i])); } - 
scales.resize(kernelNum, 1.0f); - if (asymmetricQuantFlag) { - scales.resize(kernelNum*2, 1.0f); - } break; } @@ -614,39 +426,13 @@ int writeFb(std::unique_ptr& netT, const std::string& MNNModelFile, m break; } - std::ostringstream outputStringStreamCQ, outputStringStreamSQ; - WriteCQBlobs(outputStringStreamCQ, weightData.data(), scales.data(), kernelSize, kernelNum, asymmetricQuantFlag); - WriteSparseQuanBlobs(outputStringStreamSQ, weightData.data(), scales.data(), kernelSize, kernelNum, asymmetricQuantFlag); - - if (opType == MNN::OpType_ConvInt8 || opType == MNN::OpType_DepthwiseConvInt8) { - if (weightSize < (outputStringStreamCQ.str().size() + sizeof(float)) && weightSize < (outputStringStreamSQ.str().size() + sizeof(float))) { - return; // only encode when it is smaller - } - } - - param->quanParameter.reset(new MNN::IDSTQuanT); - auto tempString = outputStringStreamCQ.str(); - param->quanParameter->type = 1; - if (outputStringStreamSQ.str().size() < tempString.size()) { - tempString = outputStringStreamSQ.str(); - param->quanParameter->type = 2; - } - - param->quanParameter->buffer.resize(tempString.size()); - ::memcpy(param->quanParameter->buffer.data(), tempString.data(), tempString.size()); - - param->quanParameter->quantScale = 1.0f; - if (asymmetricQuantFlag) { - param->quanParameter->readType = kernelNum; - } - if (opType == MNN::OpType_ConvInt8 || opType == MNN::OpType_DepthwiseConvInt8) { + param->quanParameter = IDSTEncoder::encode(weightData, scales, kernelSize, kernelNum, false, param->symmetricQuan->weight.data(), int(clampMin)); param->symmetricQuan->weight.clear(); param->quanParameter->alpha = {1.0f}; // fake scales - param->quanParameter->has_scaleInt = true; } else { + param->quanParameter = IDSTEncoder::encode(weightData, scales, kernelSize, kernelNum, asymmetricQuantFlag, quantWeights.data(), int(clampMin)); param->weight.clear(); - param->quanParameter->alpha = std::move(scales); } }; @@ -687,6 +473,32 @@ int writeFb(std::unique_ptr& netT, const std::string& MNNModelFile, m LOG(FATAL) << "These Op Not Support: " << opNames.substr(0, opNames.size() - 2); } + // dump input and output tensor name + { + std::set inputIdx, outputIdx, realInput, realOutput; + for (const auto& op : netT->oplists) { + for (auto i : op->inputIndexes) { + inputIdx.insert(i); + } + for (auto o : op->outputIndexes) { + outputIdx.insert(o); + if (op->type == OpType_Input) { + realInput.insert(o); + } + } + } + std::set_difference(outputIdx.begin(), outputIdx.end(), inputIdx.begin(), inputIdx.end(), std::inserter(realOutput, realOutput.begin())); + std::cout << "inputTensors : [ "; + for (int i : realInput) { + std::cout << netT->tensorName[i] << ", "; + } + std::cout << "]\noutputTensors: [ "; + for (int i : realOutput) { + std::cout << netT->tensorName[i] << ", "; + } + std::cout << "]" << std::endl; + } + flatbuffers::FlatBufferBuilder builderOutput(1024); builderOutput.ForceDefaults(true); auto len = MNN::Net::Pack(builderOutput, netT.get()); diff --git a/tools/converter/source/compression/CMakeLists.txt b/tools/converter/source/compression/CMakeLists.txt index 1b1f65a9..f7cef15c 100644 --- a/tools/converter/source/compression/CMakeLists.txt +++ b/tools/converter/source/compression/CMakeLists.txt @@ -4,6 +4,6 @@ protobuf_generate_cpp(MNN_COMPRESSION_PROTO_SRCS MNN_COMPRESSION_PROTO_HDRS file(GLOB CALIBRATION_SRC ${CMAKE_CURRENT_LIST_DIR}/*.cpp) add_library(MNNCompress OBJECT ${CALIBRATION_SRC} ${MNN_COMPRESSION_PROTO_SRCS}) -target_include_directories(MNNCompress PRIVATE 
${CMAKE_CURRENT_BINARY_DIR}/) +target_include_directories(MNNCompress PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/) list(APPEND MNN_CONVERTER_BACKENDS_OBJECTS $) list(APPEND MNN_CONVERTER_BACKENDS_TARGETS MNNCompress) diff --git a/tools/converter/source/onnx/GridSampleOnnx.cpp b/tools/converter/source/onnx/GridSampleOnnx.cpp new file mode 100644 index 00000000..65bd0c6b --- /dev/null +++ b/tools/converter/source/onnx/GridSampleOnnx.cpp @@ -0,0 +1,73 @@ +// +// GridSampleOnnx.cpp +// MNNConverter +// +// Created by MNN on 2021/03/24. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "onnxOpConverter.hpp" + +DECLARE_OP_CONVERTER(GridSampleOnnx); + +MNN::OpType GridSampleOnnx::opType(){ + return MNN::OpType_GridSample; +} + +MNN::OpParameter GridSampleOnnx::type(){ + return MNN::OpParameter_GridSample; +} + +void GridSampleOnnx::run(MNN::OpT *dstOp, const onnx::NodeProto *onnxNode, std::vector initializers){ + + auto gridSampleParam = new MNN::GridSampleT; + + gridSampleParam->mode = MNN::SampleMode_BILINEAR; + gridSampleParam->paddingMode = MNN::BorderMode_ZEROS; + gridSampleParam->alignCorners = false; + for (int i = 0; i < onnxNode->attribute_size(); ++i) { + const auto &attributeProto = onnxNode->attribute(i); + const auto &attributeName = attributeProto.name(); + if (attributeName == "mode") { + switch (attributeProto.i()) { + case 0: + gridSampleParam->mode = MNN::SampleMode_BILINEAR; + break; + case 1: + gridSampleParam->mode = MNN::SampleMode_NEAREST; + break; + default: + LOG(FATAL) << "Unknown mode for " << onnxNode->name() << "!"; + break; + } + } + if (attributeName == "padding_mode") { + switch (attributeProto.i()) { + case 0: + gridSampleParam->paddingMode = MNN::BorderMode_ZEROS; + break; + case 1: + gridSampleParam->paddingMode = MNN::BorderMode_CLAMP; + break; + case 2: + gridSampleParam->paddingMode = MNN::BorderMode_REFLECTION; + break; + default: + LOG(FATAL) << "Unknown padding mode for " << onnxNode->name() << "!"; + break; + } + } + if (attributeName == "align_corners") { + gridSampleParam->alignCorners = attributeProto.i(); + } + } + + dstOp->main.value = gridSampleParam; +} + +// REGISTER_CONVERTER(GridSampleOnnx, GridSample); + +// When we export torch.nn.functional.grid_sample to onnx, it's called GridSampler rather than GridSample, +// thus, we have to add the "r" +#define REGISTER_CONVERTER_r(name, opType) static onnxOpConverterRegister _Convert_##opType(#opType"r") +REGISTER_CONVERTER_r(GridSampleOnnx, GridSample); diff --git a/tools/converter/source/onnx/onnxConverter.cpp b/tools/converter/source/onnx/onnxConverter.cpp index cdc95bec..061b0e74 100644 --- a/tools/converter/source/onnx/onnxConverter.cpp +++ b/tools/converter/source/onnx/onnxConverter.cpp @@ -19,10 +19,9 @@ #include "onnx.pb.h" #include "onnxConverter.hpp" #include "onnxOpConverter.hpp" -#include "options.hpp" int onnx2MNNNet(const std::string inputModel, const std::string bizCode, - const common::Options& options, std::unique_ptr& netT) { + std::unique_ptr& netT) { onnx::ModelProto onnxModel; // read ONNX Model bool success = onnx_read_proto_from_binary(inputModel.c_str(), &onnxModel); diff --git a/tools/converter/source/optimizer/CMakeLists.txt b/tools/converter/source/optimizer/CMakeLists.txt index 123e439b..8dbeafd6 100644 --- a/tools/converter/source/optimizer/CMakeLists.txt +++ b/tools/converter/source/optimizer/CMakeLists.txt @@ -1,4 +1,5 @@ file(GLOB_RECURSE OPTIMIZER_SRC ${CMAKE_CURRENT_LIST_DIR}/*.cpp) add_library(MNNConverterOpt OBJECT ${OPTIMIZER_SRC}) 
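Aside (illustrative sketch, not part of the patch): with the MNNCompress include directory now exported as PUBLIC above, any target that links MNNCompress — such as MNNConverterOpt via the target_link_libraries line that follows — can include the generated MNN_compression.pb.h directly. A minimal sketch of the assumed usage, mirroring how writeFb.cpp and Conv1dSqueezeMove.cpp load the compression pipeline proto; the helper name is made up:

    // Sketch under the assumption that the enclosing target links MNNCompress.
    #include <fstream>
    #include <string>
    #include "MNN_compression.pb.h"

    static bool loadCompressionPipeline(const std::string& path,
                                        MNN::Compression::Pipeline* pipeline) {
        std::fstream input(path.c_str(), std::ios::in | std::ios::binary);
        // Parse the binary compression proto, as the converter sources do.
        return pipeline->ParseFromIstream(&input);
    }
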
+target_link_libraries(MNNConverterOpt PRIVATE MNNCompress) list(APPEND MNN_CONVERTER_BACKENDS_OBJECTS $) list(APPEND MNN_CONVERTER_BACKENDS_TARGETS MNNConverterOpt) diff --git a/tools/converter/source/optimizer/GenerateSubGraph.cpp b/tools/converter/source/optimizer/GenerateSubGraph.cpp index 1aff2983..099b18d3 100644 --- a/tools/converter/source/optimizer/GenerateSubGraph.cpp +++ b/tools/converter/source/optimizer/GenerateSubGraph.cpp @@ -377,6 +377,7 @@ std::vector> _makeWhile(std::shared_ptr cNode, whileParam->cond_graph = condGraph->name; whileParam->body_graph = bodyGraph->name; + std::set extraInputIndexes; // Remove Merge and find body std::vector bodyUpdate; std::set bodyOutputNames; @@ -384,12 +385,29 @@ std::vector> _makeWhile(std::shared_ptr cNode, std::vector> updateIndexes; auto childs = std::move(cNode->nodes); std::map replaceTensor; + std::set updateToTensors; for (auto& op : childs) { if (op->type == OpType_Extra && op->main.AsExtra()->type == "Merge") { - updateIndexes.emplace_back(std::make_pair(op->inputIndexes[1], op->inputIndexes[0])); - replaceTensor.insert(std::make_pair(op->outputIndexes[0], op->inputIndexes[0])); - bodyUpdate.emplace_back(op->inputIndexes[1]); - bodyOutputNames.insert(netT->tensorName[op->inputIndexes[1]]); + int updateFromIdx = op->inputIndexes[1], updateToIdx = op->inputIndexes[0]; + // if tensor_x is at outside of loop and used by two op, and these two op + // are all update data, so need copy tensor_x to tensor_x_copy. + if (updateToTensors.find(updateToIdx) != updateToTensors.end()) { + std::unique_ptr copyOp(new OpT); + copyOp->type = OpType_Concat; + copyOp->inputIndexes.push_back(updateToIdx); + auto opName = netT->tensorName[updateToIdx] + "_copy"; + updateToIdx = netT->tensorName.size(); + copyOp->outputIndexes.push_back(updateToIdx); + netT->tensorName.push_back(opName); + netT->tensorNumber++; + res.emplace_back(std::move(copyOp)); + extraInputIndexes.insert(updateToIdx); + } + updateToTensors.insert(updateToIdx); + updateIndexes.emplace_back(std::make_pair(updateFromIdx, updateToIdx)); + replaceTensor.insert(std::make_pair(op->outputIndexes[0], updateToIdx)); + bodyUpdate.emplace_back(updateFromIdx); + bodyOutputNames.insert(netT->tensorName[updateFromIdx]); continue; } cNode->nodes.emplace_back(std::move(op)); @@ -432,7 +450,6 @@ std::vector> _makeWhile(std::shared_ptr cNode, // Create Loop Cond std::set invalidSet; std::vector inputIndexes; - std::set extraInputIndexes; for (auto& node : cNode->nodes) { Express::Program::createUnit(varMap, inputIndexes, cNode->nodes, node.get(), netT, invalidSet, extraInputIndexes); } diff --git a/tools/converter/source/optimizer/PostConverter.cpp b/tools/converter/source/optimizer/PostConverter.cpp index 5835eba0..f0fb80aa 100644 --- a/tools/converter/source/optimizer/PostConverter.cpp +++ b/tools/converter/source/optimizer/PostConverter.cpp @@ -58,6 +58,7 @@ bool CompleteSubGraph(const std::unordered_map& inputs, const subnet->oplists = std::move(mutable_subgraph->nodes); subnet->tensorName = mutable_subgraph->tensors; subnet->sourceType = ctx->source; + subnet->outputName = outputNames; std::unique_ptr new_subnet = ctx->RunOptimize(subnet, inputs); mutable_subgraph->nodes = std::move(subnet->oplists); @@ -177,6 +178,9 @@ std::unique_ptr optimizeNetImpl(std::unique_ptr& originNet // Remove Dropout, if `forTraining` flag is set, Dropout will be reserved "RemoveDropout", + // Remove Dup op + "FuseDupOp", + // Turn InnerProduct from Caffe / Onnx to Convolution "TransformInnerProduct", @@ -493,10 
+497,19 @@ std::unique_ptr optimizeNet(std::unique_ptr& originNet, bo Global::Reset(&ctx); - std::unordered_map empty; - for (auto& subGraph : originNet->subgraphs) { - CompleteSubGraph(empty, subGraph.get()); + if (!originNet->subgraphs.empty()) { + std::unordered_map inputs; + auto program = Program::create(originNet.get(), true); + for (const auto& iter : program->vars()) { + if (iter.first < originNet->tensorName.size() && iter.first >= 0) { + inputs[originNet->tensorName[iter.first]] = iter.second; + } + } + for (auto& subGraph : originNet->subgraphs) { + CompleteSubGraph(inputs, subGraph.get()); + } } + std::unordered_map empty; std::unique_ptr net = ctx.RunOptimize(originNet, empty); fuseConstIntoSubgraph(net.get(), ctx.completed_subgraphs); for (auto* subgraph : ctx.completed_subgraphs) { diff --git a/tools/converter/source/optimizer/Program.hpp b/tools/converter/source/optimizer/Program.hpp index ca6b165f..ef1ec1d4 100644 --- a/tools/converter/source/optimizer/Program.hpp +++ b/tools/converter/source/optimizer/Program.hpp @@ -29,6 +29,9 @@ public: void input(const std::unordered_map& inputs); static void createUnit(std::map& varMap, std::vector& inputIndexes, const std::vector>& oplists, MNN::OpT* op, const MNN::NetT* net, std::set& invalidSet, std::set& extraInputIndexes); + const std::map& vars() const { + return mVars; + } private: Program() { } diff --git a/tools/converter/source/optimizer/merge/Conv1dSqueezeMove.cpp b/tools/converter/source/optimizer/merge/Conv1dSqueezeMove.cpp new file mode 100644 index 00000000..9e7f4fd1 --- /dev/null +++ b/tools/converter/source/optimizer/merge/Conv1dSqueezeMove.cpp @@ -0,0 +1,261 @@ +// +// Conv1dSqueezeMove.cpp +// MNNConverter +// +// Created by MNN on 2021/03/05. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "../TemplateMerge.hpp" +#include "MNN_generated.h" +#include "MergeHelpers.hpp" +#include "../../common/Global.hpp" +#include "cli.hpp" +#include "MNN_compression.pb.h" +#include + +namespace MNN { +namespace Express { + +enum Conv1dPostCases { + None, + BiasAdd, + Relu, + // don't need BiasAddRelu +}; + +auto getConv1dPostCase = [](EXPRP expr) { + auto noPost = Conv1dPostCases::None; + auto returnPost = noPost; + + if (nullptr == expr->get()) { + return noPost; + } + + auto opType = expr->get()->type(); + + auto gConverterConfig = Global::Get(); + std::string compressFileName = gConverterConfig->compressionParamsFile; + Compression::Pipeline proto; + if (compressFileName != "") { + std::fstream input(compressFileName.c_str(), std::ios::in | std::ios::binary); + if (!proto.ParseFromIstream(&input)) { + MNN_ERROR("Failed to parse compression pipeline proto.\n"); + } + } + + auto findQuantParameters = [&](Compression::Pipeline& proto, std::string outputTensorName) { + for (const auto& algo : proto.algo()) { + if (algo.type() == Compression::CompressionAlgo::QUANTIZE) { + auto quantParams = algo.quant_params(); + for (const auto& layerProto : quantParams.layer()) { + const std::string& outputName = layerProto.output(0).name(); + if (outputName == outputTensorName) { + return layerProto; + } + } + } + } + MNN::Compression::LayerQuantizeParams empty; + return empty; + }; + + EXPRP squeezeExpr = nullptr; + + // BiasAdd + if (opType == OpType::OpType_BinaryOp) { + auto binaryOp = expr->get(); + auto binaryParams = binaryOp->main_as_BinaryOp(); + if (binaryParams->opType() != BinaryOpOperation_ADD) { + return noPost; + } + + auto input0 = expr->inputs()[0]; + auto expr0 = input0->expr().first; + auto input1 = 
expr->inputs()[1]; + auto expr1 = input1->expr().first; + + EXPRP constExpr = nullptr; + VARP constVar = nullptr; + + if (helpers::IsConstant(expr0) && helpers::IsConstant(expr1)) { + return noPost; + } + if (helpers::IsConstant(expr0)) { + constExpr = expr0; + constVar = input0; + squeezeExpr = expr1; + } else if (helpers::IsConstant(expr1)) { + constExpr = expr1; + constVar = input1; + squeezeExpr = expr0; + } else { + return noPost; + } + + if (constExpr->get() == nullptr) { // expr const + if (constVar->getInfo()->dim.size() > 1) { + return noPost; + } + } else { // op const + auto constParam = constExpr->get()->main_as_Blob(); + if (constParam->dims()->size() > 1) { + return noPost; + } + } + + if (!squeezeExpr->get() || squeezeExpr->get()->type() != OpType::OpType_Squeeze) { + return noPost; + } + auto squeezeDims = squeezeExpr->get()->main_as_SqueezeParam()->squeezeDims(); + if (squeezeDims->size() != 1) { + return noPost; + } + if ((squeezeDims->data()[0] == -1) || (squeezeDims->data()[0] == 3)) { + return noPost; + } + + returnPost = Conv1dPostCases::BiasAdd; + } + // relu + else if (opType == OpType::OpType_ReLU || opType == OpType::OpType_ReLU6) { + auto input = expr->inputs()[0]; + auto inputExpr = input->expr().first; + + if (!inputExpr->get() || inputExpr->get()->type() != OpType::OpType_Squeeze) { + return noPost; + } + squeezeExpr = inputExpr; + + returnPost = Conv1dPostCases::Relu; + } + else { + return noPost; + } + + if (squeezeExpr != nullptr) { + auto squeezeInput = squeezeExpr->inputs()[0]; + auto squeezeInputExpr = squeezeInput->expr().first; + if (squeezeInputExpr->get() && squeezeInputExpr->get()->main_type() == OpParameter_Convolution2D && squeezeInputExpr->outputs().size() == 1) { + if (compressFileName != "") { + auto quantParams = findQuantParameters(proto, squeezeInputExpr->outputName(0)); + // some conv1d squeeze may not be considered + if (quantParams.weight_size() != 0) { + return noPost; + } + } + } + } + + return returnPost; +}; + +static auto gRegister = []() { + auto match = [](EXPRP expr) { + auto postCase = getConv1dPostCase(expr); + if (postCase != Conv1dPostCases::None) { + return true; + } + + return false; + }; + + auto transform = [](EXPRP expr) { + auto postCase = getConv1dPostCase(expr); + + if (postCase == Conv1dPostCases::BiasAdd) { + auto input0 = expr->inputs()[0]; + auto expr0 = input0->expr().first; + auto input1 = expr->inputs()[1]; + auto expr1 = input1->expr().first; + + EXPRP constExpr = nullptr; + VARP constVar = nullptr; + EXPRP squeezeExpr = nullptr; + VARP squeezeInput = nullptr; + int constIndex = 0; + std::vector newBiasAddInputs; + + if (helpers::IsConstant(expr0)) { + constExpr = expr0; + constVar = input0; + squeezeExpr = expr1; + squeezeInput = expr1->inputs()[0]; + constIndex = 0; + } else if (helpers::IsConstant(expr1)) { + constExpr = expr1; + constVar = input1; + squeezeExpr = expr0; + squeezeInput = expr0->inputs()[0]; + constIndex = 1; + } + + auto squeezeInputExpr = squeezeInput->expr().first; + if (squeezeInputExpr->get() && squeezeInputExpr->get()->main_type() == OpParameter_Convolution2D && squeezeInputExpr->outputs().size() == 1) { + auto convInput = squeezeInputExpr->inputs()[0]; + auto newConvExpr = Expr::create(squeezeInputExpr->extra(), {convInput}); + newConvExpr->setName(squeezeInputExpr->name()); + auto newConvOutput = Variable::create(newConvExpr, 0); + newConvOutput->setName(squeezeInputExpr->outputName(0)); + squeezeInput = newConvOutput; + } + + if (constIndex == 0) { + 
newBiasAddInputs.push_back(constVar); + newBiasAddInputs.push_back(squeezeInput); + } else { + newBiasAddInputs.push_back(squeezeInput); + newBiasAddInputs.push_back(constVar); + } + + auto newBiasAddExpr = Expr::create(expr->extra(), std::move(newBiasAddInputs)); + newBiasAddExpr->setName(expr->name()); + auto newBiasAddVar = Variable::create(newBiasAddExpr, 0); + newBiasAddVar->setName(expr->outputName(0)); + auto newSqueezeExpr = Expr::create(squeezeExpr->extra(), {newBiasAddVar}); + newSqueezeExpr->setName(squeezeExpr->name()); + auto newSqueezeVar = Variable::create(newSqueezeExpr, 0); + newSqueezeVar->setName(squeezeExpr->outputName(0)); + + Expr::replace(expr, newSqueezeExpr); + return true; + } + + if (postCase == Conv1dPostCases::Relu) { + auto input = expr->inputs()[0]; + auto squeezeExpr = input->expr().first; + auto squeezeInput = squeezeExpr->inputs()[0]; + auto squeezeInputExpr = squeezeInput->expr().first; + + if (squeezeInputExpr->get() && squeezeInputExpr->get()->main_type() == OpParameter_Convolution2D && squeezeInputExpr->outputs().size() == 1) { + auto convInput = squeezeInputExpr->inputs()[0]; + auto newConvExpr = Expr::create(squeezeInputExpr->extra(), {convInput}); + newConvExpr->setName(squeezeInputExpr->name()); + auto newConvOutput = Variable::create(newConvExpr, 0); + newConvOutput->setName(squeezeInputExpr->outputName(0)); + squeezeInput = newConvOutput; + } + + auto newReluExpr = Expr::create(expr->extra(), {squeezeInput}); + newReluExpr->setName(expr->name()); + auto newReluVar = Variable::create(newReluExpr, 0); + newReluVar->setName(expr->outputName(0)); + auto newSqueezeExpr = Expr::create(squeezeExpr->extra(), {newReluVar}); + newSqueezeExpr->setName(squeezeExpr->name()); + auto newSqueezeVar = Variable::create(newSqueezeExpr, 0); + newSqueezeVar->setName(squeezeExpr->outputName(0)); + + Expr::replace(expr, newSqueezeExpr); + return true; + } + + return false; + }; + + TemplateMerge::getInstance("Merge").insertTemplate("Conv1dSqueezeMove", match, transform, + PASS_PRIORITY_HIGH); + return true; +}(); + +} +} // namespace MNN diff --git a/tools/converter/source/optimizer/merge/ConvBNReluFuseToConvInt8.cpp b/tools/converter/source/optimizer/merge/ConvBNReluFuseToConvInt8.cpp index a7d8d958..6e167ef9 100644 --- a/tools/converter/source/optimizer/merge/ConvBNReluFuseToConvInt8.cpp +++ b/tools/converter/source/optimizer/merge/ConvBNReluFuseToConvInt8.cpp @@ -136,8 +136,7 @@ static auto gRegister = []() { const std::string& tensor_name = layer_proto.output(0).name(); if (tensor_name == convExpr->outputName(0)) { auto weightProto = layer_proto.weight(0); - auto ws = weightProto.scales(); - for (int i = 0; i < ws.size(); i++) { + for (int i = 0; i < weightProto.scales().size(); i++) { weightScaleVector.emplace_back(weightProto.scales(i)); } wClampMin = weightProto.clamp_min(); diff --git a/tools/converter/source/optimizer/onnxextra/OnnxConvolutionMerge.cpp b/tools/converter/source/optimizer/onnxextra/OnnxConvolutionMerge.cpp index 9ba35d45..9be28c24 100644 --- a/tools/converter/source/optimizer/onnxextra/OnnxConvolutionMerge.cpp +++ b/tools/converter/source/optimizer/onnxextra/OnnxConvolutionMerge.cpp @@ -313,7 +313,10 @@ public: } EXPRP convolutinExpr; if (!outputShape.empty()) { - auto output_shape = _Const(outputShape.data(), {static_cast(outputShape.size())}, NHWC, halide_type_of()); + // [1, outputHeight, outputWidth, 1] + outputShape.insert(outputShape.begin(), 1); + outputShape.push_back(1); + auto output_shape = _Const(outputShape.data(), {4}, NHWC, 
halide_type_of()); if (weightDataPtr) { // merge weight(bias) node to Conv parameter convolutinExpr = Expr::create(newOp.get(), {x, output_shape}); diff --git a/tools/converter/source/optimizer/onnxextra/OnnxEinsum.cpp b/tools/converter/source/optimizer/onnxextra/OnnxEinsum.cpp new file mode 100644 index 00000000..b0f4f581 --- /dev/null +++ b/tools/converter/source/optimizer/onnxextra/OnnxEinsum.cpp @@ -0,0 +1,227 @@ + +// +// OnnxEinsum.cpp +// MNNConverter +// +// Created by MNN on 2021/03/24. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "MNN_generated.h" +#include "OnnxExtraManager.hpp" +#include +namespace MNN { +namespace Express { + +class OnnxEinsumTransform : public OnnxExtraManager::Transform { +public: + virtual EXPRP onExecute(EXPRP expr) const override { + auto inputs = expr->inputs(); + auto op = expr->get(); + auto extraParam = op->main_as_Extra(); + std::string equation; + if (nullptr != extraParam->attr()) { + const int attrSize = extraParam->attr()->size(); + for (int i = 0; i < attrSize; ++i) { + auto attr = extraParam->attr()->GetAs(i); + const auto& key = attr->key()->str(); + if (key == "equation") { + equation = attr->s()->str(); + } + } + } + if (equation.empty()) { + MNN_ERROR("Can't convert Einsum for invalid Equation\n"); + return nullptr; + } + // Remove space + std::vector valid; + for (int i=0; i"); + if (pos == std::string::npos) { + MNN_ERROR("Can't convert Einsum for no support Equation:%s\n", equation.c_str()); + return nullptr; + } + auto left = equation.substr(0, pos); + auto right = equation.substr(pos+2, equation.size()); + if (expr->inputs().size() == 1 ){ + auto currentVar = expr->inputs()[0]; + std::map outputPos; + for (int i=0; i reduceAxis; + std::map inputPosRemap; + int pos = 0; + for (int i=0; i permuteDims; + for (int i=0; isetName(expr->name()); + return currentVar->expr().first; + } + if (inputs.size() !=2 ) { + MNN_ERROR("Can't convert Einsum for input size = %d\n", (int)inputs.size()); + return nullptr; + } + auto iPos = left.find(","); + auto input0 = left.substr(0, iPos); + auto input1 = left.substr(iPos+1, left.size()); + + std::map input0Pos; + for (int i=0; i input1Pos; + for (int i=0; i outputPos; + std::vector sumPos; + std::vector bothPos; + std::vector aPos; + std::vector bPos; + for (int i=0; iinputs()[0]; + auto var1 = expr->inputs()[1]; + if (sumPos.empty()) { + // Broadcast Mul + { + // Reshape + Transpose + std::vector reshapeDims(outputPos.size(), 0); + int insertPos = (int)input0Pos.size(); + std::vector transpose; + for (int i=0; isecond); + } + } + var0 = _Permute(_Reshape(var0, reshapeDims), transpose); + } + { + // Reshape + Transpose + std::vector reshapeDims(outputPos.size(), 0); + int insertPos = (int)input1Pos.size(); + std::vector transpose; + for (int i=0; isecond); + } + } + var1 = _Permute(_Reshape(var1, reshapeDims), transpose); + } + auto output = var0 * var1; + output->setName(expr->name()); + return output->expr().first; + } + // MatMul + { + // Reshape + Transpose + // AB -> A -> B -> sum + std::vector reshapeDims(input0Pos.size() + bPos.size(), 0); + for (int i = (int)input0Pos.size(); i transpose; + MNN_ASSERT(bothPos.size() + aPos.size() + bPos.size() + sumPos.size() == reshapeDims.size()); + for (int i=0; i A -> B -> sum + std::vector reshapeDims(input1Pos.size() + aPos.size(), 0); + for (int i = (int)input1Pos.size(); i transpose; + MNN_ASSERT(bothPos.size() + aPos.size() + bPos.size() + sumPos.size() == reshapeDims.size()); + for (int i=0; isetName(expr->name()); + return 
output->expr().first; + } +}; + +static auto gRegister = []() { + OnnxExtraManager::get()->insert("Einsum", std::shared_ptr(new OnnxEinsumTransform)); + return true; +}(); + +} // namespace Express +} // namespace MNN diff --git a/tools/converter/source/optimizer/onnxextra/OnnxSlice.cpp b/tools/converter/source/optimizer/onnxextra/OnnxSlice.cpp index 56a23912..4e8c8f2c 100644 --- a/tools/converter/source/optimizer/onnxextra/OnnxSlice.cpp +++ b/tools/converter/source/optimizer/onnxextra/OnnxSlice.cpp @@ -86,6 +86,12 @@ public: // Use TF's stridedslice, turn onnx slice attribute to tf format auto rank = _Unsqueeze(_Rank(input), {0}); if (nullptr != axisVar) { + auto axisPtr = axisVar->readMap(); + if (nullptr != axisPtr) { + if (0 > axisPtr[0]) { + axisVar = axisVar + _Rank(input); + } + } auto shape = _Shape(input, true); auto defaultVar = _Fill(_Shape(axisVar, true), _Scalar(1)); auto mask = _Scalar(1) - _ScatterNd(axisVar, defaultVar, rank); diff --git a/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp b/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp index a9c1baf1..6c596f04 100644 --- a/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp +++ b/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp @@ -7,6 +7,9 @@ // #include "../PostTreatUtils.hpp" +#include "../../common/Global.hpp" +#include "config.hpp" + using namespace MNN; const std::set NC4HW4_OPs = { MNN::OpType_ConvInt8, @@ -40,14 +43,34 @@ const std::set NC4HW4_OPs = { MNN::OpType_FloatToInt8, MNN::OpType_ConvInt8, MNN::OpType_DepthwiseConvInt8, + MNN::OpType_GridSample, }; const std::set COMPABILITY_OPs = { MNN::OpType_ReLU, MNN::OpType_ReLU6, MNN::OpType_Concat, MNN::OpType_Slice, MNN::OpType_Permute, MNN::OpType_Selu, MNN::OpType_ConvertTensor, MNN::OpType_Sigmoid, MNN::OpType_Cast, MNN::OpType_BatchToSpaceND, MNN::OpType_SpaceToBatchND, MNN::OpType_Reshape, MNN::OpType_TanH, MNN::OpType_Eltwise, MNN::OpType_Padding, MNN::OpType_ELU, - MNN::OpType_Dropout}; + MNN::OpType_Dropout, MNN::OpType_UnaryOp, MNN::OpType_DepthToSpace, MNN::OpType_SpaceToDepth, +}; +const std::set COMPABILITY_NCHW_OPs = { + MNN::OpType_Transpose, + MNN::OpType_StridedSlice, + MNN::OpType_SliceTf, + MNN::OpType_Unsqueeze, + MNN::OpType_Squeeze, + MNN::OpType_Crop, + MNN::OpType_Tile, + MNN::OpType_Pack, + MNN::OpType_Unpack, + MNN::OpType_Fill, + MNN::OpType_BroadcastTo, + MNN::OpType_Padding, + MNN::OpType_Flatten, + MNN::OpType_ExpandDims, + MNN::OpType_ReverseSequence, + MNN::OpType_BinaryOp, +}; static bool _OpNeedConvertContent(OpType type, int index) { switch (type) { case OpType_Shape: @@ -62,6 +85,7 @@ static bool _OpNeedConvertContent(OpType type, int index) { case OpType_Interp: case OpType_Crop: case OpType_Reshape: + case OpType_GridSample: case OpType_Resize: case OpType_Padding: if (1 <= index) { @@ -73,6 +97,19 @@ static bool _OpNeedConvertContent(OpType type, int index) { } return true; } + +static bool isCompabilityOp(OpType type, MNN_DATA_FORMAT originTensorType, float version) { + if (COMPABILITY_OPs.find(type) != COMPABILITY_OPs.end()) { + return true; + } + if (version < 1.1f || originTensorType != MNN_DATA_FORMAT_NCHW) { + return false; + } + if (version < 1.2f && type == OpType_BinaryOp) { + return false; + } + return COMPABILITY_NCHW_OPs.find(type) != COMPABILITY_NCHW_OPs.end(); +} class AddTensorFormatConverter : public PostConverter { public: virtual bool onExecute(std::unique_ptr& net) const override { @@ -85,6 +122,8 @@ public: if 
(mNet->sourceType == MNN::NetSource_ONNX) { originTensorType = MNN::MNN_DATA_FORMAT_NCHW; } + auto config = Global::Get(); + auto version = config->targetVersion; // set the layout of every tensor // Don't support inplace @@ -98,7 +137,7 @@ public: type = iter->main.AsTensorConvertInfo()->dest; } else if (NC4HW4_OPs.find(iter->type) != NC4HW4_OPs.end()) { type = MNN::MNN_DATA_FORMAT_NC4HW4; - } else if (COMPABILITY_OPs.find(iter->type) != COMPABILITY_OPs.end()) { + } else if (isCompabilityOp(iter->type, originTensorType, version)) { int nc4hw4TypeNumber = 0; // NC4HW4 number int originTypeNumber = 0; for (int i = 0; i < iter->inputIndexes.size(); ++i) { diff --git a/tools/converter/source/optimizer/postconvert/FuseDupOp.cpp b/tools/converter/source/optimizer/postconvert/FuseDupOp.cpp new file mode 100644 index 00000000..18fa8959 --- /dev/null +++ b/tools/converter/source/optimizer/postconvert/FuseDupOp.cpp @@ -0,0 +1,122 @@ +// +// FuseDupOp.cpp +// MNNConverter +// +// Created by MNN on 2021/02/23. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "../PostTreatUtils.hpp" +#include +#include +using namespace MNN; +class FuseDupOp : public PostConverter { +public: + static bool isSameOp(const MNN::OpT* op0, const MNN::OpT* op1) { + if (op0->type != op1->type) { + return false; + } + if (op0->main.type != op1->main.type) { + return false; + } + if (op0->inputIndexes != op1->inputIndexes) { + return false; + } + if (op0->outputIndexes.size() != op1->outputIndexes.size()) { + return false; + } + if (op0->main.type == OpParameter_NONE) { + return true; + } + if (op0->type == OpType_ReLU) { + return op0->main.AsRelu()->slope == op1->main.AsRelu()->slope; + } + if (op0->type == OpType_ReLU6) { + return op0->main.AsRelu6()->maxValue == op1->main.AsRelu6()->maxValue && op0->main.AsRelu6()->minValue == op1->main.AsRelu6()->minValue; + } + if (op0->main.type == OpParameter_Blob) { + auto v0 = op0->main.AsBlob(); + auto v1 = op1->main.AsBlob(); + if (v0->dataFormat != v1->dataFormat) { + return false; + } + if (v0->dataType != v1->dataType) { + return false; + } + if (v0->dims != v1->dims) { + return false; + } + if (v0->dataFormat != v1->dataFormat) { + return false; + } + if (DataType_DT_INT32 == v0->dataType) { + return v0->int32s == v1->int32s; + } + } + if (op0->main.type == OpParameter_UnaryOp) { + return op0->main.AsUnaryOp()->opType == op1->main.AsUnaryOp()->opType; + } + if (op0->main.type == OpParameter_BinaryOp) { + return op0->main.AsBinaryOp()->opType == op1->main.AsBinaryOp()->opType; + } + if (op0->main.type == OpParameter_ReductionParam) { + return op0->main.AsReductionParam()->operation == op1->main.AsReductionParam()->operation; + } + return false; + } + virtual bool onExecute(std::unique_ptr& net) const override { + std::set unusefulOps; + std::map replaceIndexes; + std::set outputNames(net->outputName.begin(), net->outputName.end()); + for (int i=0; ioplists.size(); ++i) { + auto originOp = net->oplists[i].get(); + if (nullptr == originOp) { + continue; + } + for (int j=i+1; j < net->oplists.size(); ++j) { + auto judgeOp = net->oplists[j].get(); + if (nullptr == judgeOp) { + continue; + } + if (isSameOp(originOp, judgeOp)) { + auto keepOp = originOp, removeOp = judgeOp; + // outputs must keep + if (outputNames.find(removeOp->name) != outputNames.end()) { + keepOp = removeOp; + removeOp = originOp; + } + for (int v=0; voutputIndexes.size(); ++v) { + replaceIndexes.insert(std::make_pair(removeOp->outputIndexes[v], keepOp->outputIndexes[v])); + } + 
net->oplists[j].reset(); + } + } + } + // Remove nullptr op + auto tempOpList = std::move(net->oplists); + net->oplists.clear(); + for (int i=0; ioplists.emplace_back(std::move(tempOpList[i])); + } + } + + // Replace index + for (auto& op : net->oplists) { + for (int i=0; iinputIndexes.size(); ++i) { + auto iter = replaceIndexes.find(op->inputIndexes[i]); + if (iter!=replaceIndexes.end()) { + op->inputIndexes[i] = iter->second; + } + } + for (int i=0; ioutputIndexes.size(); ++i) { + auto iter = replaceIndexes.find(op->outputIndexes[i]); + if (iter!=replaceIndexes.end()) { + op->outputIndexes[i] = iter->second; + } + } + } + return true; + } +}; +static PostConverterRegister __l("FuseDupOp"); diff --git a/tools/converter/source/optimizer/postconvert/RemoveUnusefulOp.cpp b/tools/converter/source/optimizer/postconvert/RemoveUnusefulOp.cpp index 047a0720..441eded1 100644 --- a/tools/converter/source/optimizer/postconvert/RemoveUnusefulOp.cpp +++ b/tools/converter/source/optimizer/postconvert/RemoveUnusefulOp.cpp @@ -20,7 +20,7 @@ public: bool shouldDeleteJudge(const MNN::OpT* op, const MNN::NetT* const netPtr) const override { static auto unuseOpType = std::vector({OpType_Seq2Out}); static auto unuseExtraOpType = - std::vector({"Identity", "NoOp", "Print", "Assert", "StopGradient", "Enter", "NextIteration"}); + std::vector({"Identity", "IdentityN", "NoOp", "Print", "Assert", "StopGradient", "Enter", "NextIteration"}); if (std::find(unuseOpType.begin(), unuseOpType.end(), op->type) != unuseOpType.end()) { return true; } @@ -51,6 +51,9 @@ public: return true; } } + if (op->type == OpType_Slice && op->outputIndexes.size() == 1) { + return true; + } return false; }; bool shouldRemoveUnusefulInputs(const MNN::OpT* op) const override { diff --git a/tools/converter/source/optimizer/tfextra/TFConvolutionMerge.cpp b/tools/converter/source/optimizer/tfextra/TFConvolutionMerge.cpp index ad50387e..83eae610 100644 --- a/tools/converter/source/optimizer/tfextra/TFConvolutionMerge.cpp +++ b/tools/converter/source/optimizer/tfextra/TFConvolutionMerge.cpp @@ -178,28 +178,24 @@ public: // NHWC => NMHWC (Raster: NCHW => NMCHW) auto x = _Concat(convs, 1); // NMHWC => NMAC (Raster: NMCHW => NMCA) - auto shape = convs[0]->getInfo()->dim; - int batch_n = shape[0]; - int kernel_h = shape[1]; - int kernel_w = shape[2]; - int input_c = shape[3]; - shape[1] = multiplier; - shape[2] = kernel_h * kernel_w; - x = _Reshape(x, shape); + auto shape = _Split(_Shape(convs[0]), {1, 1, 1, 1}, 0); + auto batch_n = shape[0]; + auto kernel_h = shape[1]; + auto kernel_w = shape[2]; + auto input_c = shape[3]; + auto multip = _Const(&multiplier, {1}, NHWC, halide_type_of()); + x = _Reshape(x, _Concat({batch_n, multip, _Multiply(kernel_h, kernel_w), input_c}, 0)); // NMAC => NACM (Raster: NMCA => NCMA) x = _Transpose(x, {0, 2, 3, 1}); - shape[0] = batch_n; - shape[1] = kernel_h; - shape[2] = kernel_w; - shape[3] = input_c * multiplier; + auto outputShape = _Concat({batch_n, kernel_h, kernel_w, _Multiply(input_c, multip)}, 0); // NACM => NHWC (NCMA => NCHW) std::unique_ptr reshape(new OpT); reshape->type = OpType_Reshape; + reshape->name = expr->name() + "_Reshape"; reshape->main.type = OpParameter_Reshape; reshape->main.value = new ReshapeT; - reshape->main.AsReshape()->dims = shape; reshape->main.AsReshape()->dimType = MNN_DATA_FORMAT_NHWC; - return (Expr::create(reshape.get(), {x})); + return (Expr::create(reshape.get(), {x, outputShape})); } }; diff --git a/tools/converter/source/tensorflow/ReverseSequence.cpp 
b/tools/converter/source/tensorflow/ReverseSequence.cpp index 00dcecc2..88af23cf 100644 --- a/tools/converter/source/tensorflow/ReverseSequence.cpp +++ b/tools/converter/source/tensorflow/ReverseSequence.cpp @@ -36,3 +36,18 @@ void ReverseSequence::run(MNN::OpT *dstOp, TmpNode *srcNode) { } REGISTER_CONVERTER(ReverseSequence, ReverseSequence); + +DECLARE_OP_CONVERTER(Reverse); + +MNN::OpType Reverse::opType() { + return MNN::OpType_Reverse; +} +MNN::OpParameter Reverse::type() { + return MNN::OpParameter_NONE; +} + +void Reverse::run(MNN::OpT *dstOp, TmpNode *srcNode) { + dstOp->main.value = nullptr; +} + +REGISTER_CONVERTER(Reverse, ReverseV2); diff --git a/tools/converter/source/tensorflow/TFGraphResolver.cpp b/tools/converter/source/tensorflow/TFGraphResolver.cpp index 545d7cac..ef0ea36e 100644 --- a/tools/converter/source/tensorflow/TFGraphResolver.cpp +++ b/tools/converter/source/tensorflow/TFGraphResolver.cpp @@ -290,8 +290,7 @@ void TFGraphResolver::ResolveQuantization( } } -TFGraphResolver::TFGraphResolver(const tensorflow::GraphDef& graph_def, - const common::Options& options) { +TFGraphResolver::TFGraphResolver(const tensorflow::GraphDef& graph_def) { std::unique_ptr tf_graph(new TFGraph); const int count = graph_def.node_size(); for (int i = 0; i < count; ++i) { @@ -302,16 +301,6 @@ TFGraphResolver::TFGraphResolver(const tensorflow::GraphDef& graph_def, graphs_.push_back(std::move(tf_graph)); TFGraph* main_graph = graphs_.back().get(); - // Resolve quantization. - if (options.doCompress) { - const auto& pipeline = options.compressionPipeline; - for (const auto& progress : pipeline.progress()) { - if (progress.type != CompressionAlgo::QUANTIZE) { - continue; - } - ResolveQuantization(main_graph, progress.quant_params); - } - } } const TFGraph* TFGraphResolver::graph(const int graph_index) const { diff --git a/tools/converter/source/tensorflow/TFGraphResolver.hpp b/tools/converter/source/tensorflow/TFGraphResolver.hpp index 0932ec71..bf2085d8 100644 --- a/tools/converter/source/tensorflow/TFGraphResolver.hpp +++ b/tools/converter/source/tensorflow/TFGraphResolver.hpp @@ -12,7 +12,6 @@ #include #include -#include "options.hpp" #include "MNN/MNNDefine.h" #include "graph.pb.h" #include "MNN_generated.h" @@ -59,8 +58,7 @@ class TFGraph { class TFGraphResolver { public: - explicit TFGraphResolver(const tensorflow::GraphDef& graph_def, - const common::Options& options); + explicit TFGraphResolver(const tensorflow::GraphDef& graph_def); virtual ~TFGraphResolver() = default; TFGraph* graph(const int graph_index); diff --git a/tools/converter/source/tensorflow/tensorflowConverter.cpp b/tools/converter/source/tensorflow/tensorflowConverter.cpp index e63e66a0..8a4e1914 100644 --- a/tools/converter/source/tensorflow/tensorflowConverter.cpp +++ b/tools/converter/source/tensorflow/tensorflowConverter.cpp @@ -11,18 +11,16 @@ #include "logkit.h" #include "TFGraphResolver.hpp" -#include "options.hpp" #include "tensorflowConverter.hpp" int tensorflow2MNNNet(const std::string inputModel, const std::string bizCode, - const common::Options& options, std::unique_ptr &netT) { // Load tensorflow model. 
tensorflow::GraphDef tfGraph; bool success = tf_read_proto_from_binary(inputModel.c_str(), &tfGraph); DCHECK(success) << "read_proto_from_binary failed"; - TFGraphResolver resolver(tfGraph, options); + TFGraphResolver resolver(tfGraph); for (int i = 0; i < resolver.graph_size(); ++i) { const TFGraph *graph = resolver.graph(i); auto graph_proto = graph->ToProto(); diff --git a/tools/converter/source/tflite/ConvolutionTflite.cpp b/tools/converter/source/tflite/ConvolutionTflite.cpp index 5f077f94..69654f42 100644 --- a/tools/converter/source/tflite/ConvolutionTflite.cpp +++ b/tools/converter/source/tflite/ConvolutionTflite.cpp @@ -248,11 +248,11 @@ void TransposeConvTflite::run(MNN::OpT *dstOp, const std::unique_ptr weightData; weightData.resize(weightSize); auto originalWeightPtr = reinterpret_cast(tfliteModelBuffer[weightTensor->buffer]->data.data()); - convertDataFormatTflite(originalWeightPtr, weightData.data(), kh, kw, ci, co); + convertDataFormatTflite(originalWeightPtr, weightData.data(), kh, kw, ci, co, true); convolution2DFloat->weight = weightData; // bias std::vector biasData(co, 0.0f); - if (inputSize == 3) { + if (inputSize == 4) { const auto& biasTensor = tfliteTensors[tfliteOp->inputs[2]]; auto biasDataPtr = reinterpret_cast(tfliteModelBuffer[biasTensor->buffer]->data.data()); if(biasDataPtr){ @@ -278,9 +278,6 @@ void TransposeConvTflite::run(MNN::OpT *dstOp, const std::unique_ptrstrideY = tfliteConvOption->stride_h; common->padMode = MNN::PadMode_SAME; common->hasOutputShape = true; - if (tfliteConvOption->padding == tflite::Padding_VALID) { - common->padMode = MNN::PadMode_VALID; - } dstOp->main.value = convolution2DFloat; } @@ -292,7 +289,6 @@ void TransposeConvTflite::run(MNN::OpT *dstOp, const std::unique_ptrinputIndexes[0] = tfliteOp->inputs[2]; dstOp->inputIndexes[1] = tfliteOp->inputs[0]; dstOp->outputIndexes[0] = tfliteOp->outputs[0]; - } diff --git a/tools/converter/source/tflite/TfliteUtils.cpp b/tools/converter/source/tflite/TfliteUtils.cpp index 7bb743c8..d3ff8f53 100644 --- a/tools/converter/source/tflite/TfliteUtils.cpp +++ b/tools/converter/source/tflite/TfliteUtils.cpp @@ -93,23 +93,23 @@ void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier, *quantized_multiplier = static_cast(q_fixed); } -bool convertDataFormatTflite(const float* src, float* dst, int KH, int KW, int CI, int CO) { +bool convertDataFormatTflite(const float* src, float* dst, int KH, int KW, int CI, int CO, bool deconv) { DCHECK(KH > 0); DCHECK(KW > 0); DCHECK(CI > 0); DCHECK(CO > 0); DCHECK(src != nullptr); - // CO KH KW CI --> CO CI KH KW + // deconv: CI KH KW CO --> CO CI KH KW + // conv : CO KH KW CI --> CO CI KH KW for (int oc = 0; oc < CO; ++oc) { for (int ic = 0; ic < CI; ++ic) { for (int h = 0; h < KH; ++h) { for (int w = 0; w < KW; ++w) { - dst[(oc * CI + ic) * KH * KW + h * KW + w] = src[(oc * KH + h) * KW * CI + w * CI + ic]; + dst[(oc * CI + ic) * KH * KW + h * KW + w] = deconv ? 
src[(ic * KH + h) * KW * CO + w * CO + oc] : src[(oc * KH + h) * KW * CI + w * CI + ic]; } } } } - return true; } diff --git a/tools/converter/source/tflite/TfliteUtils.hpp b/tools/converter/source/tflite/TfliteUtils.hpp index b05e631e..e902b7ba 100644 --- a/tools/converter/source/tflite/TfliteUtils.hpp +++ b/tools/converter/source/tflite/TfliteUtils.hpp @@ -30,7 +30,7 @@ void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier, // weight format converter // CO KH KW CI --> CO CI KH KW -bool convertDataFormatTflite(const float* src, float* dst, int KH, int KW, int CI, int CO); +bool convertDataFormatTflite(const float* src, float* dst, int KH, int KW, int CI, int CO, bool deconv = false); MNN::DataType TfliteDataTypeToMNN(tflite::TensorType type); diff --git a/tools/converter/source/tflite/UnaryTflite.cpp b/tools/converter/source/tflite/UnaryTflite.cpp index 4f7e57c2..792ad5b4 100644 --- a/tools/converter/source/tflite/UnaryTflite.cpp +++ b/tools/converter/source/tflite/UnaryTflite.cpp @@ -55,6 +55,10 @@ void UnaryTflite::run(MNN::OpT* dstOp, const std::unique_ptr& param->opType=MNN::UnaryOpOperation_SIN; break; } + case tflite::BuiltinOperator_HARD_SWISH:{ + param->opType=MNN::UnaryOpOperation_HARDSWISH; + break; + } default:{ LOG(ERROR) << "MNN Converter Not " "Supported!!! UnaryOp: " @@ -74,3 +78,4 @@ REGISTER_CONVERTER(UnaryTflite, BuiltinOperator_NEG); REGISTER_CONVERTER(UnaryTflite, BuiltinOperator_SQRT); REGISTER_CONVERTER(UnaryTflite, BuiltinOperator_LOG); REGISTER_CONVERTER(UnaryTflite, BuiltinOperator_SIN); +REGISTER_CONVERTER(UnaryTflite, BuiltinOperator_HARD_SWISH); diff --git a/tools/converter/source/tflite/liteConverter.cpp b/tools/converter/source/tflite/liteConverter.cpp index 35a31dd4..3027b531 100644 --- a/tools/converter/source/tflite/liteConverter.cpp +++ b/tools/converter/source/tflite/liteConverter.cpp @@ -12,7 +12,6 @@ #include "liteConverter.hpp" #include "liteOpConverter.hpp" -#include "options.hpp" static MNN::DataType _dataTypeMap(tflite::TensorType type) { switch (type) { @@ -102,7 +101,7 @@ static bool needExtractInput(uint32_t opCode) { } int tflite2MNNNet(const std::string inputModel, const std::string bizCode, - const common::Options& options, std::unique_ptr& MNNNetT) { + std::unique_ptr& MNNNetT) { const std::string model_name = inputModel; auto model = std::shared_ptr(new TfliteModel(model_name)); model->readModel(); @@ -123,7 +122,8 @@ int tflite2MNNNet(const std::string inputModel, const std::string bizCode, for (int j = 0; j < opNums; ++j) { const int opcodeIndex = ops[j]->opcode_index; const auto opCode = tfliteOpSet[opcodeIndex]->builtin_code; - if (opCode == tflite::BuiltinOperator_CONV_2D || opCode == tflite::BuiltinOperator_DEPTHWISE_CONV_2D) { + if (opCode == tflite::BuiltinOperator_CONV_2D || opCode == tflite::BuiltinOperator_DEPTHWISE_CONV_2D || + opCode == tflite::BuiltinOperator_TRANSPOSE_CONV) { const int weightIndex = ops[j]->inputs[1]; const auto& weightTensor = tensors[weightIndex]; quantizedModel = weightTensor->type == tflite::TensorType_UINT8; diff --git a/tools/converter/source/torchscript/torchscriptConverter.cpp b/tools/converter/source/torchscript/torchscriptConverter.cpp index 83781d56..7eb93cd6 100644 --- a/tools/converter/source/torchscript/torchscriptConverter.cpp +++ b/tools/converter/source/torchscript/torchscriptConverter.cpp @@ -14,7 +14,6 @@ #include "flatbuffers/idl.h" #include "flatbuffers/minireflect.h" #include "flatbuffers/util.h" -#include "options.hpp" #include "TorchScriptDialect.hpp" 
#include "MLIRGen.hpp" @@ -85,7 +84,7 @@ std::vector getIntVector(mlir::Attribute a) { } int torchscript2MNNNet(const std::string inputModel, const std::string bizCode, - const common::Options& options, std::unique_ptr& netT) { + std::unique_ptr& netT) { printf("TorchScript Converter!\n"); mlir::MLIRContext context; // Load our Dialect in this MLIR Context. diff --git a/tools/cpp/IDSTEncoder.hpp b/tools/cpp/IDSTEncoder.hpp new file mode 100644 index 00000000..77dd13e5 --- /dev/null +++ b/tools/cpp/IDSTEncoder.hpp @@ -0,0 +1,416 @@ +// +// IDSTEncoder.hpp +// MNN +// +// Created by MNN on 2021/02/26. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef IDSTENCODER_HPP +#define IDSTENCODER_HPP + +#include +#include +#include "MNN_generated.h" + +using namespace MNN; + +namespace IDSTEncoder { + +static void WriteBlobDim(std::ostream &out, std::vector dims) +{ + char tmp[4]; + ((unsigned char *)tmp)[0] = (unsigned char)dims.size(); + out.write(tmp, 1); + for (int i = 0; i < dims.size(); i++) + { + unsigned short tmpShort = (unsigned short)dims[i]; + out.write((const char*)(&tmpShort), 2); + } +} + +static void FillBuffer(char *buf, unsigned int buf_len, const char *arr, unsigned int arr_len, unsigned char iNeedBits) +{ + memset(buf, 0, buf_len); + char *tmp = buf; + int iOffset = 0; + unsigned char cMask = (1 << iNeedBits) - 1; + for (int i = 0; i < arr_len; i++) + { + char value = arr[i]; + int uShift = 8 - iNeedBits - iOffset % 8; + if (uShift < 0) + { + tmp[iOffset / 8] |= ((value & cMask) >> (0 - uShift)); + tmp[(iOffset / 8) + 1] |= ((value & cMask) << (8 + uShift)); + } + else + { + tmp[iOffset / 8] |= ((value & cMask) << uShift); + } + iOffset += iNeedBits; + if (iOffset % 8 == 0) + { + tmp += iOffset / 8; + iOffset = 0; + } + } +} + +static void GetWeightSet(std::set &setWeight, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag) +{ + setWeight.clear(); + if (asymmetricQuantFlag) { + for (int i = 0; i < channel; i++) + { + float min = alphaData[2*i]; + float alpha = alphaData[2*i+1]; + if (alpha <= 1e-6f) + { + setWeight.insert(-128); + continue; + } + for (int j = 0; j < area; j++) + { + float weight = weightData[i * area + j]; + setWeight.insert(fmax(fmin(round((weight - min) / alpha) + (-128), 127), -128)); + } + } + } else { + for (int i = 0; i < channel; i++) + { + float alpha = alphaData[i]; + if (alpha <= 1e-6f) + { + setWeight.insert(0); + continue; + } + for (int j = 0; j < area; j++) + { + float weight = weightData[i * area + j]; + setWeight.insert(fmax(fmin(round(weight / alpha), 127), -128)); + } + } + } +} + +static float GetSparsity(const float* weightData, int weightSize, unsigned int& nnz, const float* alphaData, int area, int channel, bool asymmetricQuantFlag, int iMaxStep = -1) +{ + nnz = 0; + int iPreIdx = 0; + float sparsity; + if (asymmetricQuantFlag) { + for (int i = 0; i < weightSize; i++) + { + float min = alphaData[2*(i/area)]; + float alpha = alphaData[2*(i/area)+1]; + int zeroQuant = -128; + if (alpha > 1e-6) { + zeroQuant = round((0.0f - min) / alpha) + (-128); + } + + float weight = weightData[i]; + int value = -128; + if (alpha > 1e-6) + { + value = round((weight - min) / alpha) + (-128); + } + + if (value != zeroQuant) + { + nnz++; + iPreIdx = i; + } + if ((i - iPreIdx >= iMaxStep) && (iMaxStep != -1)) + { + nnz++; + iPreIdx = i; + } + } + } else { + for (int i = 0; i < weightSize; i++) + { + float alpha = alphaData[i / area]; + float weight = weightData[i]; + int value = 0; + if (alpha > 1e-6f) 
+ { + value = round(weight / alpha); + } + + if (value != 0) + { + nnz++; + iPreIdx = i; + } + if ((i - iPreIdx >= iMaxStep) && (iMaxStep != -1)) + { + nnz++; + iPreIdx = i; + } + } + } + sparsity = 1 - 1.0f * nnz / weightSize; + return sparsity; +} + +static unsigned int GetBestMaxStep(const float* weightData, int weightSize, unsigned char& iMaxStepBits, int BlobDataSize, const float* alphaData, int area, int channel, bool asymmetricQuantFlag) +{ + size_t szBestSize = 1000000000; + unsigned int best_nnz = 0; + for (int i = 2; i < 9; i++) + { + unsigned int nnz = 0; + GetSparsity(weightData, weightSize, nnz, alphaData, area, channel, asymmetricQuantFlag, pow(2, i) - 1); + size_t tmp = ceil(0.125 * nnz * i) + ceil(0.125 * nnz * BlobDataSize); + if (tmp < szBestSize) + { + iMaxStepBits = (unsigned char) i; + szBestSize = tmp; + best_nnz = nnz; + } + } + return best_nnz; +} + +static void WriteCQBlobs(std::ostream &out, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag) +{ + //push values into buffer + //Find int values in all blobs and check; + std::set setWeight; + GetWeightSet(setWeight, weightData, alphaData, area, channel, asymmetricQuantFlag); + int iCount = setWeight.size(); + int iNeedBits = ceil(log2(iCount)); + if (iNeedBits > 8) { + MNN_ERROR("The Bits need large than 8, the model may be error for user\n"); + return; + } + std::map mapWeight; + int iIdx = 0; + for (std::set::iterator it = setWeight.begin(); it != setWeight.end(); it++) + { + mapWeight[*it] = iIdx++; + } + size_t buf_len = size_t(ceil(0.125 * iNeedBits * area * channel)); + char *buf = new char[buf_len]; + { + char *arr = new char[area * channel]; + char *tmp = arr; + if (asymmetricQuantFlag) { + for (int i = 0; i < channel; i++) + { + float min = alphaData[2*i]; + float alpha = alphaData[2*i+1]; + for (int j = 0; j < area; j++) + { + float weight = weightData[i * area + j]; + int value = -128; + if (alpha > 1e-6f) + { + value = fmax(fmin(round((weight - min) / alpha) + (-128), 127), -128); + } + *tmp = mapWeight[value]; + tmp++; + } + } + } else { + for (int i = 0; i < channel; i++) + { + float alpha = alphaData[i]; + for (int j = 0; j < area; j++) + { + float weight = weightData[i * area + j]; + int value = 0; + if (alpha > 1e-6f) + { + value = fmax(fmin(round(weight / alpha), 127), -128); + } + *tmp = mapWeight[value]; + tmp++; + } + } + } + FillBuffer(buf, buf_len, arr, area * channel, iNeedBits); + delete[] arr; + } + //begin write to file + { + char tmp[100]; + //1. weights blob shape(unsigned int32) + WriteBlobDim(out, {channel, area}); + // 2. Avalable values Count(unsigned char) + tmp[0] = (unsigned char)iCount; + out.write(tmp, 1); + // 3. valueset(signed char * valueset_size) + for (auto it = setWeight.begin(); it != setWeight.end(); it++) + { + tmp[0] = (unsigned char)*it; + out.write(tmp, 1); + } + // 4. 
weights indexes(size = ceil(0.125*weights_count*ceil(log2(Avalable_values_Count)))) + out.write(buf, buf_len); + //g_totalSize += 1 + setWeight.size() + buf_len; + } + delete[] buf; +} + +static void WriteSparseQuanBlobs(std::ostream &out, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag) +{ + std::set setWeight; + GetWeightSet(setWeight, weightData, alphaData, area, channel, asymmetricQuantFlag); + int iDataNeedBits = ceil(log2(setWeight.size())); + unsigned int nnz = 0; + int weightSize = area * channel; + std::map mapWeight; + { + int iIdx = 0; + for (auto it = setWeight.begin(); it != setWeight.end(); it++) + { + mapWeight[*it] = iIdx++; + } + } + unsigned char iNeedBits; + nnz = GetBestMaxStep(weightData, weightSize, iNeedBits, iDataNeedBits, alphaData, area, channel, asymmetricQuantFlag); + //weight buf + size_t data_buf_len = size_t(ceil(0.125 * iDataNeedBits * nnz)); + char* data_buf = new char[data_buf_len]; + //sparse COO buf + size_t buf_len = size_t(ceil(0.125 * iNeedBits * nnz)); + char* buf = new char[buf_len]; + { //fill buf with step values; + unsigned char* arr_idx = new unsigned char[nnz]; + unsigned char* data_arr = new unsigned char[nnz]; + unsigned char* tmp = arr_idx; + int iMaxStep = pow(2, iNeedBits) - 1; + int iPreIdx = 0; + unsigned char* dTmp = data_arr; + if (asymmetricQuantFlag) { + for (int i = 0; i < weightSize; i++) + { + float min = alphaData[2*(i/area)]; + float alpha = alphaData[2*(i/area)+1]; + int zeroQuant = -128; + if (alpha > 1e-6) { + zeroQuant = round((0.0f - min) / alpha) + (-128); + } + + float weight = weightData[i]; + int value = -128; + if (alpha > 1e-6) + { + value = round((weight - min) / alpha) + (-128); + } + + if (value != zeroQuant) + { + *dTmp = mapWeight[value]; + *tmp = i - iPreIdx; + iPreIdx = i; + tmp++; + dTmp++; + } + if (i - iPreIdx >= iMaxStep) + { + *dTmp = mapWeight[zeroQuant]; + *tmp = i - iPreIdx; + iPreIdx = i; + tmp++; + dTmp++; + } + } + } else { + for (int i = 0; i < weightSize; i++) + { + float alpha = alphaData[i / area]; + float weight = weightData[i]; + int value = 0; + if (alpha > 1e-6f) + { + value = round(weight / alpha); + } + + if (value != 0) + { + *dTmp = mapWeight[value]; + *tmp = i - iPreIdx; + iPreIdx = i; + tmp++; + dTmp++; + } + if (i - iPreIdx >= iMaxStep) + { + *dTmp = mapWeight[0]; + *tmp = i - iPreIdx; + iPreIdx = i; + tmp++; + dTmp++; + } + } + } + FillBuffer(buf, buf_len, (char*) arr_idx, nnz, iNeedBits); + FillBuffer(data_buf, data_buf_len, (char*) data_arr, nnz, iDataNeedBits); + delete[] arr_idx; + delete[] data_arr; + } + { //write + char tmp[100]; + // 1.weights blob shape(unsigned int32) + WriteBlobDim(out, {channel, area}); + // 2. nnz + out.write((const char*) &nnz, 4); + // 3. max_step use # bits () (unsigned char) + out.write((const char*) &iNeedBits, 1); + // 4. buf for steps ceil(nnz*step need bits/8) + out.write(buf, buf_len); + // 5. Avalable values Count(unsigned char) + tmp[0] = (unsigned char) setWeight.size(); + out.write(tmp, 1); + // 6. valueset(signed char * valueset_size) + for (auto it = setWeight.begin(); it != setWeight.end(); it++) + { + tmp[0] = (unsigned char) *it; + out.write(tmp, 1); + } + // 7. 
none zero weights indexes(nnz*ceil(log2(Avalable_values_Count))/8) + out.write((const char*) data_buf, data_buf_len); + } + delete[] buf; + delete[] data_buf; +} + +static std::unique_ptr encode(const std::vector& weight, const std::vector& scale, int kernelSize, int kernelNum, + bool asymmetricQuantFlag, const int8_t* quantWeightPtr, const int clampMin) { + std::ostringstream outputStringStreamCQ, outputStringStreamSQ; + WriteCQBlobs(outputStringStreamCQ, weight.data(), scale.data(), kernelSize, kernelNum, asymmetricQuantFlag); + WriteSparseQuanBlobs(outputStringStreamSQ, weight.data(), scale.data(), kernelSize, kernelNum, asymmetricQuantFlag); + std::unique_ptr idst(new IDSTQuanT); + auto cqStr = outputStringStreamCQ.str(); + auto sqStr = outputStringStreamSQ.str(); + int int8Size = kernelNum * kernelSize; + if (quantWeightPtr && (int8Size <= cqStr.size() && int8Size <= sqStr.size())) { + idst->type = 4; + idst->aMax = kernelNum; + idst->buffer.resize(int8Size); + ::memcpy(idst->buffer.data(), quantWeightPtr, int8Size); + } else if (cqStr.size() <= sqStr.size()) { + idst->type = 1; + idst->buffer.resize(cqStr.size()); + ::memcpy(idst->buffer.data(), cqStr.data(), cqStr.size()); + } else { + idst->type = 2; + idst->buffer.resize(sqStr.size()); + ::memcpy(idst->buffer.data(), sqStr.data(), sqStr.size()); + } + idst->alpha.resize(scale.size()); + ::memcpy(idst->alpha.data(), scale.data(), scale.size() * sizeof(float)); + idst->quantScale = 1.f; + if (asymmetricQuantFlag) { + idst->readType = kernelNum; + idst->aMin = clampMin; + } + return idst; +} + +} // namespace IDSTEncoder + +#endif // IDSTENCODER_HPP diff --git a/tools/cpp/MNNV2Basic.cpp b/tools/cpp/MNNV2Basic.cpp index a9a7a81b..ee739ec1 100644 --- a/tools/cpp/MNNV2Basic.cpp +++ b/tools/cpp/MNNV2Basic.cpp @@ -103,7 +103,7 @@ static inline int64_t getTimeInUs() { static int test_main(int argc, const char* argv[]) { if (argc < 2) { MNN_PRINT("========================================================================\n"); - MNN_PRINT("Arguments: model.MNN runTimes saveAllTensors forwardType numberThread size\n"); + MNN_PRINT("Arguments: model.MNN runLoops saveAllTensors forwardType numberThread inputSize precision\n"); MNN_PRINT("========================================================================\n"); return -1; } @@ -142,6 +142,11 @@ static int test_main(int argc, const char* argv[]) { MNN_PRINT("Use extra forward type: %d\n", type); } + int modeNum = 4; + if (argc > 5) { + modeNum = ::atoi(argv[5]); + } + // input dims std::vector inputDims; if (argc > 6) { @@ -164,9 +169,9 @@ static int test_main(int argc, const char* argv[]) { } MNN_PRINT("\n"); - int numThread = 4; - if (argc > 5) { - numThread = ::atoi(argv[5]); + int precision = BackendConfig::Precision_Low; + if (argc > 7) { + precision = atoi(argv[7]); } // create net @@ -182,13 +187,14 @@ static int test_main(int argc, const char* argv[]) { // create session MNN::ScheduleConfig config; config.type = type; - config.numThread = numThread; + /*modeNum means gpuMode for GPU usage, Or means numThread for CPU usage.*/ + config.numThread = modeNum; // If type not fount, let it failed config.backupType = type; BackendConfig backendConfig; // config.path.outputs.push_back("ResizeBilinear_2"); // backendConfig.power = BackendConfig::Power_High; - backendConfig.precision = BackendConfig::Precision_Low; + backendConfig.precision = static_cast(precision); // backendConfig.memory = BackendConfig::Memory_High; config.backendConfig = &backendConfig; MNN::Session* session = NULL; @@ -361,7 
+367,7 @@ static int test_main(int argc, const char* argv[]) { { MNN::Tensor expectTensor2(iter.second, iter.second->getDimensionType()); iter.second->copyToHostTensor(&expectTensor2); - auto outputFile = pwd + iter.first + ".txt"; + auto outputFile = pwd + "/output/" + iter.first + ".txt"; if (iter.second->size() > 0) { dumpTensor2File(&expectTensor2, outputFile.c_str()); } @@ -371,7 +377,7 @@ static int test_main(int argc, const char* argv[]) { // benchmark. for CPU, op time means calc duration; for others, op time means schedule duration. { int t = runTime; - MNN_PRINT("Run %d time:\n", t); + MNN_PRINT("precision:%d, Run %d time:\n", backendConfig.precision, t); std::map> opTimes; std::map opTypes; uint64_t opBegin = 0; @@ -401,11 +407,9 @@ static int test_main(int argc, const char* argv[]) { std::vector times(t, 0.0f); for (int i = 0; i < t; ++i) { auto begin = getTimeInUs(); - inputTensor->copyFromHostTensor(&givenTensor); net->runSessionWithCallBackInfo(session, beforeCallBack, afterCallBack, false); outputTensor->copyToHostTensor(&expectTensor); - auto end = getTimeInUs(); times[i] = (end - begin) / 1000.0f; } diff --git a/tools/cpp/backendTest.cpp b/tools/cpp/backendTest.cpp index 1c7a4f4c..64c2362e 100644 --- a/tools/cpp/backendTest.cpp +++ b/tools/cpp/backendTest.cpp @@ -34,7 +34,7 @@ inline T stringConvert(const char* number) { using namespace MNN; static void compareForwadType(Interpreter* net, MNNForwardType expectType, MNNForwardType compareType, float tolerance, - const std::map>& inputs, const std::string& stopOp, BackendConfig::PrecisionMode precision) { + const std::map>& inputs, const std::string& stopOp, BackendConfig::PrecisionMode precision, int modeNum) { std::vector> correctResult; int index; MNN::ScheduleConfig expectConfig, compareConfig; @@ -43,6 +43,7 @@ static void compareForwadType(Interpreter* net, MNNForwardType expectType, MNNFo expectConfig.type = expectType; compareConfig.type = compareType; compareConfig.backendConfig = &backendConfig; + compareConfig.mode = modeNum; auto expectSession = net->createSession(expectConfig); auto compareSession = net->createSession(compareConfig); @@ -58,7 +59,9 @@ static void compareForwadType(Interpreter* net, MNNForwardType expectType, MNNFo if (op->name() == stopOp) { return false; } - + if (op->type() == "Raster") { + return true; + } auto tensor = t[0]; if (tensor->elementSize() <= 0) { return true; @@ -74,6 +77,9 @@ static void compareForwadType(Interpreter* net, MNNForwardType expectType, MNNFo if (op->name() == stopOp) { return false; } + if (op->type() == "Raster") { + return true; + } auto tensor = t[0]; if (tensor->elementSize() <= 0) { return true; @@ -238,12 +244,17 @@ int main(int argc, const char* argv[]) { precision = (BackendConfig::PrecisionMode)atoi(argv[4]); } FUNC_PRINT(precision); + int modeNum = 1; + if(argc > 5) { + modeNum = atoi(argv[5]);//set gpu mode + } + FUNC_PRINT(modeNum); std::string stopOp = ""; - if (argc > 5) { - stopOp = argv[5]; + if (argc > 6) { + stopOp = argv[6]; } FUNC_PRINT_ALL(stopOp.c_str(), s); - compareForwadType(net.get(), MNN_FORWARD_CPU, type, tolerance, inputs, stopOp, precision); + compareForwadType(net.get(), MNN_FORWARD_CPU, type, tolerance, inputs, stopOp, precision, modeNum); return 0; } diff --git a/tools/cpp/testModelWithDescrisbe.cpp b/tools/cpp/testModelWithDescrisbe.cpp index df830bcd..309d5fdf 100644 --- a/tools/cpp/testModelWithDescrisbe.cpp +++ b/tools/cpp/testModelWithDescrisbe.cpp @@ -162,10 +162,22 @@ int main(int argc, const char* argv[]) { bool correct = 
true; for (int i = 0; i < numOfOuputs; ++i) { auto outputTensor = net->getSessionOutput(session, expectNames[i].c_str()); - std::ostringstream iStrOs; - iStrOs << i; - auto expectName = modelDir + iStrOs.str() + ".txt"; - auto expectTensor = createTensor(outputTensor, expectName); + MNN::Tensor* expectTensor = nullptr; + std::string expectName; + // First Check outputname.txt + { + std::ostringstream iStrOs; + iStrOs << expectNames[i]; + expectName = modelDir + iStrOs.str() + ".txt"; + expectTensor = createTensor(outputTensor, expectName); + } + if (!expectTensor) { + // Second check number outputs + std::ostringstream iStrOs; + iStrOs << i; + expectName = modelDir + iStrOs.str() + ".txt"; + expectTensor = createTensor(outputTensor, expectName); + } if (!expectTensor) { #if defined(_MSC_VER) std::cout << "Failed to open " << expectName << std::endl; diff --git a/tools/cpp/timeProfile.cpp b/tools/cpp/timeProfile.cpp index 29f95d0c..b252cc33 100644 --- a/tools/cpp/timeProfile.cpp +++ b/tools/cpp/timeProfile.cpp @@ -64,6 +64,12 @@ int main(int argc, const char* argv[]) { MNN_PRINT("%d ", dim); } MNN_PRINT("\n"); + int threadNumber = 4; + if (argc > 5) { + threadNumber = ::atoi(argv[4]); + MNN_PRINT("Set ThreadNumber = %d\n", threadNumber); + } + // revert MNN model if necessary auto revertor = std::unique_ptr(new Revert(fileName)); @@ -83,7 +89,7 @@ int main(int argc, const char* argv[]) { // create session MNN::ScheduleConfig config; config.type = type; - config.numThread = 4; + config.numThread = threadNumber; MNN::Session* session = NULL; session = net->createSession(config); auto inputTensor = net->getSessionInput(session, NULL); diff --git a/tools/quantization/Helper.cpp b/tools/quantization/Helper.cpp index 74b80271..882405fd 100644 --- a/tools/quantization/Helper.cpp +++ b/tools/quantization/Helper.cpp @@ -19,8 +19,12 @@ #endif #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" +#include +#include +#include +#include "core/TensorUtils.hpp" -std::set Helper::gNeedFeatureOp = {"Convolution", "ConvolutionDepthwise", "Eltwise", "Pooling"}; +std::set Helper::gNotNeedFeatureOp = { "Raster", "Pooling", "ReLU", "ReLU6", "Interp", "CropAndResize", "ROIPooling", "Gather", "GatherV2", "GatherND", "ScatterNd" }; std::set Helper::INT8SUPPORTED_OPS = { MNN::OpType_ConvInt8, MNN::OpType_DepthwiseConvInt8, MNN::OpType_PoolInt8, MNN::OpType_EltwiseInt8, @@ -38,7 +42,7 @@ bool Helper::fileExist(const std::string& file) { } #endif -void Helper::readImages(std::vector& images, const std::string& filePath, int* usedImageNum) { +void Helper::readClibrationFiles(std::vector& images, const std::string& filePath, int* usedImageNum) { int count = 0; #if defined(_MSC_VER) WIN32_FIND_DATA ffd; @@ -101,22 +105,69 @@ void Helper::readImages(std::vector& images, const std::string& fil } void Helper::preprocessInput(MNN::CV::ImageProcess* pretreat, int targetWidth, int targetHeight, - const std::string& inputImageFileName, MNN::Tensor* input) { - int originalWidth, originalHeight, comp; - auto bitmap32bits = stbi_load(inputImageFileName.c_str(), &originalWidth, &originalHeight, &comp, 4); + const std::string& filename, MNN::Tensor* input, Calibration::InputType inputType) { + if (inputType == Calibration::IMAGE) { + int originalWidth, originalHeight, comp; + auto bitmap32bits = stbi_load(filename.c_str(), &originalWidth, &originalHeight, &comp, 4); - DCHECK(bitmap32bits != nullptr) << "input image error!"; - MNN::CV::Matrix trans; - // choose resize or crop - // resize method - 
trans.setScale((float)(originalWidth - 1) / (float)(targetWidth - 1), - (float)(originalHeight - 1) / (float)(targetHeight - 1)); - // crop method - // trans.setTranslate(16.0f, 16.0f); - pretreat->setMatrix(trans); - pretreat->convert(bitmap32bits, originalWidth, originalHeight, 0, input); + DCHECK(bitmap32bits != nullptr) << "input image error!"; + MNN::CV::Matrix trans; + // choose resize or crop + // resize method + trans.setScale((float)(originalWidth - 1) / (float)(targetWidth - 1), + (float)(originalHeight - 1) / (float)(targetHeight - 1)); + // crop method + // trans.setTranslate(16.0f, 16.0f); + pretreat->setMatrix(trans); + pretreat->convert(bitmap32bits, originalWidth, originalHeight, 0, input); - stbi_image_free(bitmap32bits); + stbi_image_free(bitmap32bits); + } + if (inputType == Calibration::SEQUENCE) { + if (!stringEndWith(filename, ".txt")) { + MNN_ERROR("Error: only '.txt' files are supported for sequence input.\n"); + return; + } + + std::ifstream f(filename); + if (!f.is_open()) { + MNN_ERROR("open file %s failed.\n", filename.c_str()); + return; + } + + std::string line; + std::vector > rawData; + while (std::getline(f, line)) { + std::stringstream ss(line); + float v; + std::vector lineData; + while (ss >> v) { + lineData.emplace_back(v); + } + if (!lineData.empty()) { + rawData.emplace_back(lineData); + } + } + f.close(); + + if (rawData.empty()) { + MNN_ERROR("Error: no data found in file %s.", filename.c_str()); + return; + } + + std::vector data; + for (int i = 0; i< rawData.size(); i++) { + if (rawData[i].size() != rawData[0].size()) { + MNN_ERROR("Error: sequence length not equal in input file %s\n", filename.c_str()); + return; + } + data.insert(data.end(), rawData[i].begin(), rawData[i].end()); + } + + std::vector shape = {1, int(rawData.size()), int(rawData[0].size())}; + std::shared_ptr tensorWarp(MNN::Tensor::create(shape, input->getType(), data.data(), MNN::Tensor::CAFFE)); + input->copyFromHostTensor(tensorWarp.get()); + } } void Helper::invertData(float* dst, const float* src, int size) { @@ -128,3 +179,11 @@ void Helper::invertData(float* dst, const float* src, int size) { } } } + +bool Helper::stringEndWith(std::string const &fullString, std::string const &ending) { + if (fullString.length() >= ending.length()) { + return (0 == fullString.compare (fullString.length() - ending.length(), ending.length(), ending)); + } else { + return false; + } +} diff --git a/tools/quantization/Helper.hpp b/tools/quantization/Helper.hpp index 1fe956ca..9e313491 100644 --- a/tools/quantization/Helper.hpp +++ b/tools/quantization/Helper.hpp @@ -12,11 +12,12 @@ #include #include "MNN_generated.h" #include "logkit.h" +#include "calibration.hpp" #pragma once class Helper { public: - static std::set gNeedFeatureOp; + static std::set gNotNeedFeatureOp; static std::set INT8SUPPORTED_OPS; @@ -24,8 +25,9 @@ public: static std::set weightQuantizeMethod; static bool fileExist(const std::string& file); - static void readImages(std::vector& images, const std::string& filePath, int *usedImageNum); + static void readClibrationFiles(std::vector& images, const std::string& filePath, int *usedImageNum); static void preprocessInput(MNN::CV::ImageProcess* pretreat, int targetWidth, int targetHeight, - const std::string& inputImageFileName, MNN::Tensor* input); + const std::string& filename, MNN::Tensor* input, Calibration::InputType inputType); static void invertData(float* dst, const float* src, int size); + static bool stringEndWith(std::string const &fullString, std::string const 
&ending); }; diff --git a/tools/quantization/TensorStatistic.cpp b/tools/quantization/TensorStatistic.cpp index 4f79afac..918c80ed 100644 --- a/tools/quantization/TensorStatistic.cpp +++ b/tools/quantization/TensorStatistic.cpp @@ -35,21 +35,13 @@ static float _klDivergence(const std::vector& candidateDis, const std::ve TensorStatistic::TensorStatistic(const MNN::Tensor* tensor, std::string method, const std::string& name, float featureClampValue, int binNumber, GET_THRESHOLD_METHOD thresholdMethod) : mOriginTensor(tensor), mName(name), mBinNumber(binNumber), mThresholdMethod(thresholdMethod), mFeatureClampValue(featureClampValue) { - MNN_ASSERT(tensor->dimensions() == 4); + // MNN_ASSERT(tensor->dimensions() == 4); if (method == "KL") { auto channel = tensor->channel(); - mRangePerChannel.resize(channel); - for (auto& iter : mRangePerChannel) { - iter.first = 100000.0f; // Min Init - iter.second = -100000.0f; // Max Init - } - mIntervals.resize(channel); - mValidChannel.resize(channel); + mRange.first = 100000.0f; // Min Init + mRange.second = -100000.0f; // Max Init mHostTensor.reset(new MNN::Tensor(tensor, MNN::Tensor::CAFFE)); - mDistribution.resize(channel); - for (int c = 0; c < mDistribution.size(); ++c) { - mDistribution[c].resize(mBinNumber); - } + mDistribution.resize(mBinNumber); bool isLittleAmountData = tensor->width() * tensor->height() < 100; if (isLittleAmountData) { mThresholdMethod = THRESHOLD_MAX; @@ -67,44 +59,35 @@ void TensorStatistic::updateRange() { int width = mHostTensor->width(); int height = mHostTensor->height(); auto area = width * height; + if (area == 0) { + area = 1; + } for (int n = 0; n < batch; ++n) { auto dataBatch = mHostTensor->host() + n * mHostTensor->stride(0); for (int c = 0; c < channel; ++c) { - int cIndex = c; - if (mMergeChannel) { - cIndex = 0; - } - auto minValue = mRangePerChannel[cIndex].first; - auto maxValue = mRangePerChannel[cIndex].second; + auto minValue = mRange.first; + auto maxValue = mRange.second; auto dataChannel = dataBatch + c * mHostTensor->stride(1); for (int v = 0; v < area; ++v) { minValue = std::min(minValue, dataChannel[v]); maxValue = std::max(maxValue, dataChannel[v]); } - mRangePerChannel[cIndex].first = minValue; - mRangePerChannel[cIndex].second = maxValue; + mRange.first = minValue; + mRange.second = maxValue; } } mVisited = true; } void TensorStatistic::resetDistribution() { - for (int i = 0; i < mIntervals.size(); ++i) { - int cIndex = i; - if (mMergeChannel) { - cIndex = 0; - } - auto maxValue = std::max(fabsf(mRangePerChannel[cIndex].second), fabsf(mRangePerChannel[cIndex].first)); - mValidChannel[cIndex] = maxValue > 0.00001f; - mIntervals[cIndex] = 0.0f; - if (mValidChannel[cIndex]) { - mIntervals[cIndex] = (float)mBinNumber / maxValue; - } - } - for (auto& c : mDistribution) { - std::fill(c.begin(), c.end(), 1.0e-07); + auto maxValue = std::max(fabsf(mRange.second), fabsf(mRange.first)); + mValid = maxValue > 0.00001f; + mInterval = 0.0f; + if (mValid) { + mInterval = (float)mBinNumber / maxValue; } + std::fill(mDistribution.begin(), mDistribution.end(), 1.0e-07); // MNN_PRINT("==> %s max: %f\n", mName.c_str(),std::max(fabsf(mRangePerChannel[0].second), // fabsf(mRangePerChannel[0].first))); } @@ -119,19 +102,18 @@ void TensorStatistic::updateDistribution() { int width = mHostTensor->width(); int height = mHostTensor->height(); auto area = width * height; + if (area == 0) { + area = 1; + } for (int n = 0; n < batch; ++n) { auto dataBatch = mHostTensor->host() + n * mHostTensor->stride(0); for (int c = 0; c 
< channel; ++c) { - int cIndex = c; - if (mMergeChannel) { - cIndex = 0; - } - if (!mValidChannel[cIndex]) { + if (!mValid) { continue; } - auto multi = mIntervals[cIndex]; - auto target = mDistribution[cIndex].data(); + auto multi = mInterval; + auto target = mDistribution.data(); auto dataChannel = dataBatch + c * mHostTensor->stride(1); for (int v = 0; v < area; ++v) { auto data = dataChannel[v]; @@ -150,10 +132,6 @@ void TensorStatistic::setThresholdMethod(GET_THRESHOLD_METHOD thresholdMethod) { mThresholdMethod = thresholdMethod; } -void TensorStatistic::setChannelWise(bool mergeChannel) { - mMergeChannel = mergeChannel; -} - int TensorStatistic::_computeThreshold(const std::vector& distribution) { const int targetBinNums = 128; int threshold = targetBinNums; @@ -252,42 +230,21 @@ int TensorStatistic::_computeThreshold(const std::vector& distribution) { return threshold; } -std::vector TensorStatistic::finishAndCompute() { - std::vector scaleValue(mDistribution.size(), 0.0f); - if (mMergeChannel) { - if (!mValidChannel[0]) { - return scaleValue; - } - float sum = 0.0f; - auto& distribution = mDistribution[0]; - std::for_each(distribution.begin(), distribution.end(), [&](float n) { sum += n; }); - std::for_each(distribution.begin(), distribution.end(), [sum](float& n) { n /= sum; }); - - auto threshold = _computeThreshold(distribution); - auto scale = ((float)threshold + 0.5) / mIntervals[0] / mFeatureClampValue; - // MNN_PRINT("==> %s == %d, %f, %f\n", mName.c_str(),threshold, 1.0f / mIntervals[0], scale * mFeatureClampValue); - std::fill(scaleValue.begin(), scaleValue.end(), scale); - mScales = scaleValue; - return scaleValue; +float TensorStatistic::finishAndCompute() { + if (!mValid) { + return 0.f; } - for (int c = 0; c < mDistribution.size(); ++c) { - if (!mValidChannel[c]) { - continue; - } - float sum = 0.0f; - auto& distribution = mDistribution[c]; - std::for_each(distribution.begin(), distribution.end(), [&](float n) { sum += n; }); - std::for_each(distribution.begin(), distribution.end(), [sum](float& n) { n /= sum; }); + float sum = 0.0f; + std::for_each(mDistribution.begin(), mDistribution.end(), [&](float n) { sum += n; }); + std::for_each(mDistribution.begin(), mDistribution.end(), [sum](float& n) { n /= sum; }); - auto threshold = _computeThreshold(distribution); - scaleValue[c] = ((float)threshold + 0.5) / mIntervals[c] / mFeatureClampValue; - } - return scaleValue; + auto threshold = _computeThreshold(mDistribution); + mScale = ((float)threshold + 0.5) / mInterval / mFeatureClampValue; + // MNN_PRINT("==> %s == %d, %f, %f\n", mName.c_str(),threshold, 1.0f / mIntervals[0], mScale * mFeatureClampValue); + return mScale; } -std::vector TensorStatistic::computeScaleADMM() { - std::vector scaleValue(mOriginTensor->channel(), 0.0f); - +float TensorStatistic::computeScaleADMM() { const int count = mOriginTensor->elementSize(); float max = 0; const float bound = mFeatureClampValue; @@ -324,18 +281,16 @@ std::vector TensorStatistic::computeScaleADMM() { alpha = sum1 / sum2; } // DLOG(INFO) << "alpha final: " << alpha; - - std::fill(scaleValue.begin(), scaleValue.end(), alpha); - mScales = scaleValue; + mScale = alpha; mVisited = true; - return scaleValue; + return mScale; } std::pair, float> TensorStatistic::fakeQuantFeature() { const int count = mOriginTensor->elementSize(); const float bound = mFeatureClampValue; float* originData = mOriginTensor->host(); - const float scale = mScales[0]; + const float scale = mScale; std::vector fakeQuantedFeature; int overflowCount = 0; 
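Editorial note on the TensorStatistic.cpp changes above: the calibration statistics move from per-channel bookkeeping (mRangePerChannel, mIntervals, mValidChannel, mScales) to a single per-tensor range, interval and scale, and finishAndCompute() now returns one float computed as (threshold + 0.5) / interval / clampValue. The following is a minimal, self-contained sketch of that per-tensor data flow, not MNN's implementation: the struct name PerTensorStat and the main() demo are hypothetical, and the threshold search here is a naive "last populated bin" cut-off rather than the KL-divergence search MNN keeps in _computeThreshold.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

// Hypothetical stand-in for TensorStatistic after this diff: one range, one
// interval and one scale for the whole tensor instead of one set per channel.
struct PerTensorStat {
    float minValue =  1e5f;          // mirrors mRange.first init
    float maxValue = -1e5f;          // mirrors mRange.second init
    int   binNumber = 2048;
    float featureClampValue = 127.0f;
    std::vector<float> distribution;

    void updateRange(const std::vector<float>& data) {
        for (float v : data) {
            minValue = std::min(minValue, v);
            maxValue = std::max(maxValue, v);
        }
    }
    // mirrors resetDistribution(): interval maps |value| into [0, binNumber)
    float interval() const {
        float absMax = std::max(std::fabs(minValue), std::fabs(maxValue));
        return absMax > 1e-5f ? (float)binNumber / absMax : 0.0f;
    }
    void updateDistribution(const std::vector<float>& data) {
        if (distribution.empty()) {
            distribution.assign(binNumber, 1.0e-07f);
        }
        float multi = interval();
        for (float v : data) {
            int index = std::min((int)(std::fabs(v) * multi), binNumber - 1);
            distribution[index] += 1.0f;
        }
    }
    // mirrors the shape of finishAndCompute(): normalize the histogram, pick a
    // threshold bin, then scale = (threshold + 0.5) / interval / clampValue.
    // NOTE: this "last populated bin" threshold is a simplification; MNN uses
    // the KL-divergence search in _computeThreshold.
    float finishAndCompute() const {
        float it = interval();
        if (it <= 0.0f || distribution.empty()) {
            return 0.0f;
        }
        float sum = std::accumulate(distribution.begin(), distribution.end(), 0.0f);
        int threshold = binNumber - 1;
        while (threshold > 128 && distribution[threshold] / sum < 1e-9f) {
            --threshold;
        }
        return ((float)threshold + 0.5f) / it / featureClampValue;
    }
};

int main() {
    PerTensorStat stat;
    std::vector<float> batch = {0.1f, -0.4f, 2.3f, -1.7f, 0.0f, 3.2f};
    stat.updateRange(batch);
    stat.updateDistribution(batch);
    std::printf("per-tensor scale: %f\n", stat.finishAndCompute());
    return 0;
}

In the actual patch, _computeThreshold still performs the KL search over the (now single) distribution; only the per-channel bookkeeping is removed, which is why setChannelWise() and the mScales vector disappear from TensorStatistic.hpp in the next file diff.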
diff --git a/tools/quantization/TensorStatistic.hpp b/tools/quantization/TensorStatistic.hpp index e7bc0f8e..73aa90de 100644 --- a/tools/quantization/TensorStatistic.hpp +++ b/tools/quantization/TensorStatistic.hpp @@ -34,12 +34,11 @@ public: void updateDistribution(); void setThresholdMethod(GET_THRESHOLD_METHOD thresholdMethod); - void setChannelWise(bool mergeChannel); - std::vector finishAndCompute(); + float finishAndCompute(); // only this one for ADMM - std::vector computeScaleADMM(); + float computeScaleADMM(); std::string name() { return mName; @@ -58,21 +57,27 @@ public: private: int _computeThreshold(const std::vector& distribution); - std::vector> mRangePerChannel; - std::vector mIntervals; - std::vector mValidChannel; - std::vector> mDistribution; + // for every channel for the Tensor + std::pair mRange; + // mBinNumber / maxValue: the number of bin for range 1 + float mInterval; + // if the i-th channel's maxValue > 0.00001f, mValidChannel[i] is true + bool mValid; + // [c * mBinNumber]: store every channel's distribution using bin + std::vector mDistribution; std::shared_ptr mHostTensor; + // the Tensor const MNN::Tensor* mOriginTensor; + // bin number for distribution int mBinNumber; + // has update or not, assert update once bool mUpdatedDistributionFlag = false; bool mUpdatedRangeFlags = false; - bool mMergeChannel = true; std::string mName; GET_THRESHOLD_METHOD mThresholdMethod = THRESHOLD_KL; bool mVisited = false; - std::vector mScales; + float mScale; float mFeatureClampValue = 127.0f; }; diff --git a/tools/quantization/calibration.cpp b/tools/quantization/calibration.cpp index 6159cfb8..5be419eb 100644 --- a/tools/quantization/calibration.cpp +++ b/tools/quantization/calibration.cpp @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include #include #include @@ -23,13 +25,14 @@ #include #include "Helper.hpp" #include "core/TensorUtils.hpp" +#include "cpp/IDSTEncoder.hpp" using namespace MNN::CV; Calibration::Calibration(MNN::NetT* model, const uint8_t* modelBuffer, const int bufferSize, const std::string& configPath) : _originaleModel(model) { // when the format of input image is RGB/BGR, channels equal to 3, GRAY is 1 - int channles = 3; + _channels = 3; rapidjson::Document document; { @@ -59,23 +62,23 @@ Calibration::Calibration(MNN::NetT* model, const uint8_t* modelBuffer, const int switch (config.destFormat) { case GRAY: - channles = 1; + _channels = 1; break; case RGB: case BGR: - channles = 3; + _channels = 3; break; case RGBA: case BGRA: - channles = 4; + _channels = 4; break; default: break; } config.sourceFormat = RGBA; - std::string imagePath; - _imageNum = 0; + std::string calibrationFilePath; + _calibrationFileNum = 0; { if (picObj.HasMember("mean")) { auto mean = picObj["mean"].GetArray(); @@ -98,10 +101,13 @@ Calibration::Calibration(MNN::NetT* model, const uint8_t* modelBuffer, const int _height = picObj["height"].GetInt(); } if (picObj.HasMember("path")) { - imagePath = picObj["path"].GetString(); + calibrationFilePath = picObj["path"].GetString(); } if (picObj.HasMember("used_image_num")) { - _imageNum = picObj["used_image_num"].GetInt(); + _calibrationFileNum = picObj["used_image_num"].GetInt(); + } + if (picObj.HasMember("used_sample_num")) { + _calibrationFileNum = picObj["used_sample_num"].GetInt(); } if (picObj.HasMember("feature_quantize_method")) { std::string method = picObj["feature_quantize_method"].GetString(); @@ -152,18 +158,99 @@ Calibration::Calibration(MNN::NetT* model, const uint8_t* modelBuffer, const int if 
(picObj.HasMember("debug")) { _debug = picObj["debug"].GetBool(); } + _inputType = InputType::IMAGE; + if (picObj.HasMember("input_type")) { + std::string type = picObj["input_type"].GetString(); + if (type == "sequence") { + _inputType = InputType::SEQUENCE; + } + } } std::shared_ptr process(ImageProcess::create(config)); _process = process; // read images file names - Helper::readImages(_imgaes, imagePath.c_str(), &_imageNum); + Helper::readClibrationFiles(_calibrationFiles, calibrationFilePath.c_str(), &_calibrationFileNum); - _initMNNSession(modelBuffer, bufferSize, channles); + _initMNNSession(modelBuffer, bufferSize); _initMaps(); } -void Calibration::_initMNNSession(const uint8_t* modelBuffer, const int bufferSize, const int channels) { +std::vector Calibration::_getInputShape(std::string filename) { + std::vector inputShape; + if (_inputType == InputType::IMAGE) { + inputShape.resize(4); + auto inputTensorDataFormat = MNN::TensorUtils::getDescribe(_inputTensor)->dimensionFormat; + if (inputTensorDataFormat == MNN::MNN_DATA_FORMAT_NHWC) { + inputShape[0] = 1; + inputShape[1] = _height; + inputShape[2] = _width; + inputShape[3] = _channels; + } else { + inputShape[0] = 1; + inputShape[1] = _channels; + inputShape[2] = _height; + inputShape[3] = _width; + } + } + if (_inputType == InputType::SEQUENCE) { + if (!Helper::stringEndWith(filename, ".txt")) { + MNN_ERROR("Error: only '.txt' files are supported for sequence input.\n"); + } + + std::ifstream f(filename); + if (!f.is_open()) { + MNN_ERROR("open file %s failed.\n", filename.c_str()); + } + + std::string line; + _channels = 0; + while (std::getline(f, line)) { + std::stringstream ss(line); + float v; + int count = 0; + while (ss >> v) { + count++; + } + if (count > 0) { + _channels++; + _height = count; + } + } + + if (_channels == 0) { + MNN_ERROR("Error: no data found in file %s.", filename.c_str()); + } + + inputShape.resize(3); + auto inputTensorDataFormat = MNN::TensorUtils::getDescribe(_inputTensor)->dimensionFormat; + if (inputTensorDataFormat == MNN::MNN_DATA_FORMAT_NHWC) { + inputShape[0] = 1; + inputShape[1] = _height; + inputShape[2] = _channels; + } else { + inputShape[0] = 1; + inputShape[1] = _channels; + inputShape[2] = _height; + } + } + + return inputShape; +} + +void Calibration::_resizeIfNeeded(std::string filename, bool force) { + std::vector inputShape = _getInputShape(filename); + + if ((inputShape != _inputTensorDims && _featureQuantizeMethod == "KL") || force) { + _inputTensorDims = inputShape; + _interpreter->resizeTensor(_inputTensor, _inputTensorDims); + _interpreter->resizeSession(_session); + _interpreterOrigin->resizeTensor(_inputTensorOrigin, _inputTensorDims); + _interpreterOrigin->resizeSession(_sessionOrigin); + } +} + +void Calibration::_initMNNSession(const uint8_t* modelBuffer, const int bufferSize) { _interpreterOrigin.reset(MNN::Interpreter::createFromBuffer(modelBuffer, bufferSize)); MNN::ScheduleConfig config; _sessionOrigin = _interpreterOrigin->createSession(config); @@ -181,32 +268,25 @@ void Calibration::_initMNNSession(const uint8_t* modelBuffer, const int bufferSi _session = _interpreter->createSession(config); _inputTensor = _interpreter->getSessionInput(_session, NULL); - _inputTensorDims.resize(4); - auto inputTensorDataFormat = MNN::TensorUtils::getDescribe(_inputTensor)->dimensionFormat; - if (inputTensorDataFormat == MNN::MNN_DATA_FORMAT_NHWC) { - _inputTensorDims[0] = 1; - _inputTensorDims[1] = _height; - _inputTensorDims[2] = _width; - _inputTensorDims[3] = channels; - } else 
{ - _inputTensorDims[0] = 1; - _inputTensorDims[1] = channels; - _inputTensorDims[2] = _height; - _inputTensorDims[3] = _width; - } - if (_featureQuantizeMethod == "KL") { - _interpreter->resizeTensor(_inputTensor, _inputTensorDims); - _interpreter->resizeSession(_session); - _interpreterOrigin->resizeTensor(_inputTensorOrigin, _inputTensorDims); - _interpreterOrigin->resizeSession(_sessionOrigin); - } else if (_featureQuantizeMethod == "ADMM") { - DCHECK((_imageNum * 4 * _height * _width) < (INT_MAX / 4)) << "Use Little Number of Images When Use ADMM"; - _inputTensorDims[0] = _imageNum; + if (_featureQuantizeMethod == "ADMM") { + DCHECK((_calibrationFileNum * 4 * _height * _width) < (INT_MAX / 4)) << "Use Little Number of Images When Use ADMM"; + for (auto file : _calibrationFiles) { + std::vector sampleShape = _getInputShape(file); + if (_inputTensorDims.empty()) { + _inputTensorDims = sampleShape; + } + if (sampleShape != _inputTensorDims) { + MNN_ERROR("samples must have the same shape when using ADMM method for sequence inputs."); + } + } + _inputTensorDims[0] = _calibrationFileNum; _interpreter->resizeTensor(_inputTensor, _inputTensorDims); _interpreter->resizeSession(_session); _interpreterOrigin->resizeTensor(_inputTensorOrigin, _inputTensorDims); _interpreterOrigin->resizeSession(_sessionOrigin); } + + _resizeIfNeeded(_calibrationFiles[0]); } void Calibration::_initMaps() { @@ -222,10 +302,10 @@ void Calibration::_initMaps() { return false; } _opInfo[opName].first = nTensors; - if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) { + if (Helper::gNotNeedFeatureOp.find(info->type()) == Helper::gNotNeedFeatureOp.end()) { int i = 0; for (auto t : nTensors) { - if (_featureInfo.find(t) == _featureInfo.end()) { + if (_featureInfo.find(t) == _featureInfo.end() && MNN::TensorUtils::getDescribe(t)->memoryType != MNN::Tensor::InsideDescribe::MEMORY_VIRTUAL) { _featureInfo[t] = std::shared_ptr( new TensorStatistic(t, _featureQuantizeMethod, opName + " input_tensor_" + flatbuffers::NumToString(i), _featureClampValue)); } @@ -242,7 +322,7 @@ void Calibration::_initMaps() { return true; } _opInfo[opName].second = nTensors; - if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) { + if (Helper::gNotNeedFeatureOp.find(info->type()) == Helper::gNotNeedFeatureOp.end()) { int i = 0; for (auto t : nTensors) { if (_featureInfo.find(t) == _featureInfo.end()) { @@ -263,7 +343,7 @@ void Calibration::_initMaps() { if (iter != _skip_quant_ops.end()) { return false; } - if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) { + if (Helper::gNotNeedFeatureOp.find(info->type()) == Helper::gNotNeedFeatureOp.end()) { int i = 0; for (auto t : nTensors) { if (_featureInfoOrigin.find(t) == _featureInfoOrigin.end()) { @@ -282,7 +362,7 @@ void Calibration::_initMaps() { if (iter != _skip_quant_ops.end()) { return true; } - if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) { + if (Helper::gNotNeedFeatureOp.find(info->type()) == Helper::gNotNeedFeatureOp.end()) { int i = 0; for (auto t : nTensors) { if (_featureInfoOrigin.find(t) == _featureInfoOrigin.end()) { @@ -302,9 +382,11 @@ void Calibration::_initMaps() { } for (int i = 0; i < op->inputIndexes.size(); ++i) { _tensorMap[op->inputIndexes[i]] = _opInfo[op->name].first[i]; + _tensorIdx[_opInfo[op->name].first[i]] = op->inputIndexes[i]; } for (int i = 0; i < op->outputIndexes.size(); ++i) { _tensorMap[op->outputIndexes[i]] = _opInfo[op->name].second[i]; + 
_tensorIdx[_opInfo[op->name].second[i]] = op->outputIndexes[i]; } } @@ -320,7 +402,7 @@ void Calibration::_initMaps() { void Calibration::_computeFeatureMapsRange() { // feed input data according to input images int count = 0; - for (const auto& img : _imgaes) { + for (const auto& file : _calibrationFiles) { for (auto& iter : _featureInfo) { iter.second->setVisited(false); } @@ -329,7 +411,8 @@ void Calibration::_computeFeatureMapsRange() { iter.second->resetUpdatedRangeFlags(); } count++; - Helper::preprocessInput(_process.get(), _width, _height, img, _inputTensor); + _resizeIfNeeded(file); + Helper::preprocessInput(_process.get(), _width, _height, file, _inputTensor, _inputType); MNN::TensorCallBackWithInfo before = [&](const std::vector& nTensors, const MNN::OperatorInfo* info) { @@ -355,7 +438,7 @@ void Calibration::_computeFeatureMapsRange() { }; _interpreter->runSessionWithCallBackInfo(_session, before, after); - MNN_PRINT("\rComputeFeatureRange: %.2lf %%", (float)count * 100.0f / (float)_imageNum); + MNN_PRINT("\rComputeFeatureRange: %.2lf %%", (float)count * 100.0f / (float)_calibrationFileNum); fflush(stdout); } MNN_PRINT("\n"); @@ -387,7 +470,7 @@ void Calibration::_collectFeatureMapsDistribution() { return true; }; int count = 0; - for (const auto& img : _imgaes) { + for (const auto& file : _calibrationFiles) { count++; for (auto& iter : _featureInfo) { @@ -397,10 +480,11 @@ void Calibration::_collectFeatureMapsDistribution() { for (auto& iter : _featureInfo) { iter.second->resetUpdatedDistributionFlag(); } - Helper::preprocessInput(_process.get(), _width, _height, img, _inputTensor); + _resizeIfNeeded(file); + Helper::preprocessInput(_process.get(), _width, _height, file, _inputTensor, _inputType); _interpreter->runSessionWithCallBackInfo(_session, before, after); - MNN_PRINT("\rCollectFeatureDistribution: %.2lf %%", (float)count * 100.0f / (float)_imageNum); + MNN_PRINT("\rCollectFeatureDistribution: %.2lf %%", (float)count * 100.0f / (float)_calibrationFileNum); fflush(stdout); } MNN_PRINT("\n"); @@ -429,14 +513,14 @@ void Calibration::_computeFeatureScaleADMM() { dimType = MNN::Tensor::TENSORFLOW; } - for (const auto& img : _imgaes) { + for (const auto& file : _calibrationFiles) { auto curPtr = _inputTensor->host() + count * _inputTensor->stride(0); std::shared_ptr tensorWarp( MNN::Tensor::create(oneImageTensorDims, _inputTensor->getType(), curPtr, dimType)); - Helper::preprocessInput(_process.get(), _width, _height, img, tensorWarp.get()); + Helper::preprocessInput(_process.get(), _width, _height, file, tensorWarp.get(), _inputType); count++; - MNN_PRINT("\rProcessImage: %.2lf %%", (float)count * 100.0f / (float)_imageNum); + MNN_PRINT("\rProcessCalibrationFiles: %.2lf %%", (float)count * 100.0f / (float)_calibrationFileNum); fflush(stdout); } MNN_PRINT("\n"); @@ -446,7 +530,7 @@ void Calibration::_computeFeatureScaleADMM() { count = 0; MNN::TensorCallBackWithInfo before = [&](const std::vector& nTensors, const MNN::OperatorInfo* info) { - if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) { + if (Helper::gNotNeedFeatureOp.find(info->type()) == Helper::gNotNeedFeatureOp.end()) { for (auto t : nTensors) { if (_featureInfo.find(t) != _featureInfo.end()) { if (_featureInfo[t]->visited() == false) { @@ -461,7 +545,7 @@ void Calibration::_computeFeatureScaleADMM() { return true; }; MNN::TensorCallBackWithInfo after = [&](const std::vector& nTensors, const MNN::OperatorInfo* info) { - if (Helper::gNeedFeatureOp.find(info->type()) != 
Helper::gNeedFeatureOp.end()) { + if (Helper::gNotNeedFeatureOp.find(info->type()) == Helper::gNotNeedFeatureOp.end()) { for (auto t : nTensors) { if (_featureInfo.find(t) != _featureInfo.end()) { if (_featureInfo[t]->visited() == false) { @@ -480,214 +564,6 @@ void Calibration::_computeFeatureScaleADMM() { MNN_PRINT("\n"); } -void Calibration::_updateScale() { - for (const auto& op : _originaleModel->oplists) { - std::vector::iterator iter = std::find(_skip_quant_ops.begin(), _skip_quant_ops.end(), op->name); - if (iter != _skip_quant_ops.end()) { - continue; - } - - const auto opType = op->type; - if (opType != MNN::OpType_Convolution && opType != MNN::OpType_ConvolutionDepthwise && - opType != MNN::OpType_Eltwise) { - continue; - } - auto tensorsPair = _opInfo.find(op->name); - if (tensorsPair == _opInfo.end()) { - MNN_ERROR("Can't find tensors for %s\n", op->name.c_str()); - } - - if (opType == MNN::OpType_Eltwise) { - auto param = op->main.AsEltwise(); - // Now only support AddInt8 - if (param->type != MNN::EltwiseType_SUM) { - continue; - } - const auto& inputScale0 = _scales[tensorsPair->second.first[0]]; - const auto& inputScale1 = _scales[tensorsPair->second.first[1]]; - const auto& outputScale = _scales[tensorsPair->second.second[0]]; - const int outputScaleSize = outputScale.size(); - std::vector outputInvertScale(outputScaleSize); - Helper::invertData(outputInvertScale.data(), outputScale.data(), outputScaleSize); - op->type = MNN::OpType_EltwiseInt8; - op->main.Reset(); - op->main.type = MNN::OpParameter_EltwiseInt8; - - auto eltwiseInt8Param = new MNN::EltwiseInt8T; - auto input0ScaleParam = new MNN::QuantizedFloatParamT; - auto input1ScaleParam = new MNN::QuantizedFloatParamT; - auto outputScaleParam = new MNN::QuantizedFloatParamT; - input0ScaleParam->tensorScale = inputScale0; - input1ScaleParam->tensorScale = inputScale1; - outputScaleParam->tensorScale = outputInvertScale; - eltwiseInt8Param->inputQuan0 = std::unique_ptr(input0ScaleParam); - eltwiseInt8Param->inputQuan1 = std::unique_ptr(input1ScaleParam); - eltwiseInt8Param->outputQuan = std::unique_ptr(outputScaleParam); - op->main.value = eltwiseInt8Param; - - continue; - } - - // below is Conv/DepthwiseConv - const auto& inputScale = _scales[tensorsPair->second.first[0]]; - const auto& outputScale = _scales[tensorsPair->second.second[0]]; - - auto param = op->main.AsConvolution2D(); - const int channles = param->common->outputCount; - const int weightSize = param->weight.size(); - param->symmetricQuan.reset(new MNN::QuantizedFloatParamT); - auto& quantizedParam = param->symmetricQuan; - quantizedParam->scale.resize(channles); - quantizedParam->weight.resize(weightSize); - quantizedParam->bias.resize(channles); - - if (opType == MNN::OpType_Convolution) { - QuantizeConvPerChannel(param->weight.data(), param->weight.size(), param->bias.data(), - quantizedParam->weight.data(), quantizedParam->bias.data(), - quantizedParam->scale.data(), inputScale, outputScale, _weightQuantizeMethod, _weightClampValue); - op->type = MNN::OpType_ConvInt8; - - } else if (opType == MNN::OpType_ConvolutionDepthwise) { - QuantizeDepthwiseConv(param->weight.data(), param->weight.size(), param->bias.data(), - quantizedParam->weight.data(), quantizedParam->bias.data(), - quantizedParam->scale.data(), inputScale, outputScale, _weightQuantizeMethod, _weightClampValue); - op->type = MNN::OpType_DepthwiseConvInt8; - } - if (param->common->relu6) { - param->common->relu = true; - param->common->relu6 = false; - } - param->weight.clear(); - 
param->bias.clear(); - } -} - -void Calibration::_insertDequantize() { - // Search All Int Tensors - std::set int8Tensors; - std::set int8Outputs; - for (auto& op : _originaleModel->oplists) { - if (Helper::INT8SUPPORTED_OPS.count(op->type) > 0) { - for (auto index : op->inputIndexes) { - int8Tensors.insert(index); - } - for (auto index : op->outputIndexes) { - int8Tensors.insert(index); - int8Outputs.insert(index); - } - } - } - for (auto& op : _originaleModel->oplists) { - for (auto index : op->inputIndexes) { - auto iter = int8Outputs.find(index); - if (iter != int8Outputs.end()) { - int8Outputs.erase(iter); - } - } - } - - // Insert Convert For Not Support Int8 Ops - for (auto iter = _originaleModel->oplists.begin(); iter != _originaleModel->oplists.end();) { - auto op = iter->get(); - const auto opType = op->type; - const auto name = op->name; - // check whether is output op - // if Yes, insert dequantization op after this op - if (Helper::INT8SUPPORTED_OPS.find(opType) != Helper::INT8SUPPORTED_OPS.end()) { - // this is quantized op - iter++; - continue; - } - - auto& inputIndexes = op->inputIndexes; - const int inputSize = inputIndexes.size(); - - // insert dequantization op before this op - for (int i = 0; i < inputSize; ++i) { - const auto curInputIndex = inputIndexes[i]; - if (int8Tensors.find(curInputIndex) == int8Tensors.end()) { - continue; - } - auto input = _tensorMap[curInputIndex]; - auto inputOpScale = _scales[input]; - - // construct new op - auto dequantizationOp = new MNN::OpT; - dequantizationOp->main.type = MNN::OpParameter_QuantizedFloatParam; - dequantizationOp->name = "___Int8ToFloat___For_" + name + flatbuffers::NumToString(i); - - dequantizationOp->type = MNN::OpType_Int8ToFloat; - auto dequantizationParam = new MNN::QuantizedFloatParamT; - dequantizationOp->main.value = dequantizationParam; - dequantizationParam->tensorScale = inputOpScale; - - dequantizationOp->inputIndexes.push_back(curInputIndex); - dequantizationOp->outputIndexes.push_back(_originaleModel->tensorName.size()); - _originaleModel->tensorName.push_back(dequantizationOp->name); - - // reset current op's input index at i - inputIndexes[i] = dequantizationOp->outputIndexes[0]; - - iter = _originaleModel->oplists.insert(iter, std::unique_ptr(dequantizationOp)); - iter++; - } - - iter++; - // LOG(INFO) << "insert quantization op after this op if neccessary"; - // insert quantization op after this op if neccessary - for (int i = 0; i < op->outputIndexes.size(); ++i) { - const auto outputIndex = op->outputIndexes[i]; - if (int8Tensors.find(outputIndex) == int8Tensors.end()) { - continue; - } - auto output = _tensorMap[outputIndex]; - auto curScale = _scales[output]; - // construct one quantization op(FloatToInt8) - auto quantizationOp = new MNN::OpT; - quantizationOp->main.type = MNN::OpParameter_QuantizedFloatParam; - quantizationOp->name = name + "___FloatToInt8___" + flatbuffers::NumToString(i); - quantizationOp->type = MNN::OpType_FloatToInt8; - auto quantizationParam = new MNN::QuantizedFloatParamT; - quantizationOp->main.value = quantizationParam; - - const int channels = curScale.size(); - std::vector quantizationScale(channels); - Helper::invertData(quantizationScale.data(), curScale.data(), channels); - quantizationParam->tensorScale = quantizationScale; - - quantizationOp->inputIndexes.push_back(_originaleModel->tensorName.size()); - quantizationOp->outputIndexes.push_back(outputIndex); - _originaleModel->tensorName.push_back(_originaleModel->tensorName[outputIndex]); - 
_originaleModel->tensorName[outputIndex] = quantizationOp->name; - op->outputIndexes[i] = quantizationOp->inputIndexes[0]; - - iter = _originaleModel->oplists.insert(iter, std::unique_ptr(quantizationOp)); - iter++; - } - } - - // Insert Turn float Op for output - for (auto index : int8Outputs) { - // construct new op - auto dequantizationOp = new MNN::OpT; - dequantizationOp->main.type = MNN::OpParameter_QuantizedFloatParam; - dequantizationOp->name = "___Int8ToFloat___For_" + flatbuffers::NumToString(index); - - dequantizationOp->type = MNN::OpType_Int8ToFloat; - auto dequantizationParam = new MNN::QuantizedFloatParamT; - dequantizationOp->main.value = dequantizationParam; - dequantizationParam->tensorScale = _scales[_tensorMap[index]]; - - dequantizationOp->inputIndexes.push_back(index); - dequantizationOp->outputIndexes.push_back(_originaleModel->tensorName.size()); - auto originTensorName = _originaleModel->tensorName[index]; - _originaleModel->tensorName[index] = dequantizationOp->name; - _originaleModel->tensorName.emplace_back(originTensorName); - - _originaleModel->oplists.insert(_originaleModel->oplists.end(), std::unique_ptr(dequantizationOp)); - } -} - void Calibration::_fake_quant_weights() { auto findAbsMax = [&] (const float* weights, const int size) { float absMax = 0; @@ -734,6 +610,60 @@ void Calibration::_fake_quant_weights() { } } } + DLOG(INFO) << "fake quant weights done."; +} + +void Calibration::_insertScale() { + for (const auto iter : _scales) { + std::unique_ptr describe(new MNN::TensorDescribeT); + describe->index = _tensorIdx[iter.first]; + describe->quantInfo.reset(new MNN::TensorQuantInfoT); + describe->quantInfo->scale = iter.second; + describe->quantInfo->type = MNN::DataType_DT_INT8; + _originaleModel->extraTensorDescribe.emplace_back(std::move(describe)); + } + for (const auto& op : _originaleModel->oplists) { + const auto opType = op->type; + + std::vector::iterator iter = std::find(_skip_quant_ops.begin(), _skip_quant_ops.end(), op->name); + if (iter != _skip_quant_ops.end()) { + continue; + } + + if (opType != MNN::OpType_Convolution && opType != MNN::OpType_ConvolutionDepthwise) { + continue; + } + auto tensorsPair = _opInfo.find(op->name); + if (tensorsPair == _opInfo.end()) { + MNN_ERROR("Can't find tensors for %s\n", op->name.c_str()); + } + // below is Conv/DepthwiseConv weight quant + const float inputScale = _scales[tensorsPair->second.first[0]]; + const float outputScale = _scales[tensorsPair->second.second[0]]; + const int inputChannel = tensorsPair->second.first[0]->channel(); + const int outputChannel = tensorsPair->second.second[0]->channel(); + auto param = op->main.AsConvolution2D(); + param->common->inputCount = tensorsPair->second.first[0]->channel(); + const int channles = param->common->outputCount; + const int weightSize = param->weight.size(); + param->symmetricQuan.reset(new MNN::QuantizedFloatParamT); + param->symmetricQuan->nbits = 8; + std::vector quantizedWeight(weightSize); + std::vector quantizedWeightScale(outputChannel); + if (_weightQuantizeMethod == "MAX_ABS"){ + SymmetricQuantizeWeight(param->weight.data(), weightSize, quantizedWeight.data(), quantizedWeightScale.data(), outputChannel, _weightClampValue); + } else if (_weightQuantizeMethod == "ADMM") { + QuantizeWeightADMM(param->weight.data(), weightSize, quantizedWeight.data(), quantizedWeightScale.data(), outputChannel, _weightClampValue); + } + param->quanParameter = IDSTEncoder::encode(param->weight, quantizedWeightScale, weightSize/channles, channles, false, 
quantizedWeight.data(), -_weightClampValue); + param->quanParameter->scaleIn = inputScale; + param->quanParameter->scaleOut = outputScale; + if (param->common->relu6) { + param->common->relu = true; + param->common->relu6 = false; + } + param->weight.clear(); + } } void Calibration::_computeQuantError() { @@ -741,20 +671,18 @@ void Calibration::_computeQuantError() { std::map> overflowRatiosMap; std::map> tensorCosDistanceMap; - std::vector inputShape = {1, _inputTensorDims[1], _inputTensorDims[2], _inputTensorDims[3]}; - _interpreter->resizeTensor(_inputTensor, inputShape); - _interpreter->resizeSession(_session); - _interpreterOrigin->resizeTensor(_inputTensorOrigin, inputShape); - _interpreterOrigin->resizeSession(_sessionOrigin); - - for (const auto& img : _imgaes) { + for (const auto& file : _calibrationFiles) { count++; - Helper::preprocessInput(_process.get(), _width, _height, img, _inputTensor); + _resizeIfNeeded(file, true); + Helper::preprocessInput(_process.get(), _width, _height, file, _inputTensor, _inputType); std::map> fakeQuantedFeatures; MNN::TensorCallBackWithInfo before = [&](const std::vector& nTensors, const MNN::OperatorInfo* info) { + if (info->type() == "Raster") { + return true; + } for (auto t : nTensors) { if (_featureInfo.find(t) != _featureInfo.end()) { if (_featureInfo[t]->visited() == false) { @@ -786,10 +714,13 @@ void Calibration::_computeQuantError() { _interpreter->runSessionWithCallBackInfo(_session, before, after); - Helper::preprocessInput(_process.get(), _width, _height, img, _inputTensorOrigin); + Helper::preprocessInput(_process.get(), _width, _height, file, _inputTensorOrigin, _inputType); MNN::TensorCallBackWithInfo beforeOrigin = [&](const std::vector& nTensors, const MNN::OperatorInfo* info) { + if (info->type() == "Raster") { + return true; + } for (auto t : nTensors) { if (_featureInfoOrigin.find(t) != _featureInfoOrigin.end()) { if (_featureInfoOrigin[t]->visited() == false) { @@ -821,7 +752,7 @@ void Calibration::_computeQuantError() { _interpreterOrigin->runSessionWithCallBackInfo(_sessionOrigin, beforeOrigin, afterOrigin); - MNN_PRINT("\rcomputeDistance: %.2lf %%", (float)count * 100.0f / (float)_imageNum); + MNN_PRINT("\rcomputeDistance: %.2lf %%", (float)count * 100.0f / (float)_calibrationFileNum); fflush(stdout); } MNN_PRINT("\n\nDebug info:\n\n"); @@ -833,8 +764,8 @@ void Calibration::_computeQuantError() { sumCos += iter.second[i]; sumOverflow += overflowRatiosMap[name][i]; } - float avgCosDistance = sumCos / _imgaes.size(); - float avgOverflowRatio = sumOverflow / _imgaes.size(); + float avgCosDistance = sumCos / _calibrationFiles.size(); + float avgOverflowRatio = sumOverflow / _calibrationFiles.size(); MNN_PRINT("%s: cos distance: %f, overflow ratio: %f\n", name.c_str(), avgCosDistance, avgOverflowRatio); } @@ -849,8 +780,7 @@ void Calibration::runQuantizeModel() { if (_debug) { _computeQuantError(); } - _updateScale(); - _insertDequantize(); + _insertScale(); } void Calibration::dumpTensorScales(const std::string& modelFile) { @@ -891,9 +821,7 @@ void Calibration::dumpTensorScales(const std::string& modelFile) { writer.Key("scales"); writer.StartArray(); - for(auto scale : inputOpScale) { - writer.Double(scale); - } + writer.Double(inputOpScale); writer.EndArray(); writer.EndObject(); @@ -919,9 +847,7 @@ void Calibration::dumpTensorScales(const std::string& modelFile) { writer.Key("scales"); writer.StartArray(); - for(auto scale : outputOpScale) { - writer.Double(scale); - } + writer.Double(outputOpScale); writer.EndArray(); 
writer.EndObject(); diff --git a/tools/quantization/calibration.hpp b/tools/quantization/calibration.hpp index c0c119ea..bb93513f 100644 --- a/tools/quantization/calibration.hpp +++ b/tools/quantization/calibration.hpp @@ -31,26 +31,34 @@ public: void dumpTensorScales(const std::string& modelFile); + enum InputType { + IMAGE = 0, + SEQUENCE = 1, + }; + private: Calibration(); MNN::NetT* _originaleModel; std::shared_ptr _process; const int _binNums = 2048; - int _imageNum = 0; + int _calibrationFileNum = 0; int _width; int _height; - std::vector _imgaes; + int _channels; + std::vector _calibrationFiles; + InputType _inputType; // Tensor and Info std::map> _featureInfo; std::map> _featureInfoOrigin; std::map _tensorMap; + std::map _tensorIdx; // Op's name, Inputs, Outputs std::map, std::vector>> _opInfo; // The scale results - std::map> _scales; + std::map _scales; std::shared_ptr _interpreter; // keep mnn forward information @@ -70,21 +78,20 @@ private: std::vector _skip_quant_ops; bool _debug = false; - void _initMNNSession(const uint8_t* modelBuffer, const int bufferSize, const int channels); + std::vector _getInputShape(std::string filename); + void _resizeIfNeeded(std::string filename, bool force = false); + void _initMNNSession(const uint8_t* modelBuffer, const int bufferSize); void _initMaps(); + // compute min/max value for every Tensor void _computeFeatureMapsRange(); void _collectFeatureMapsDistribution(); void _computeFeatureScaleKL(); void _computeFeatureScaleADMM(); void _computeFeatureScaleMoving(); - void _updateScale(); void _fake_quant_weights(); void _computeQuantError(); - - // insert the dequantization op before the not supported op(int8), and insert dequantization op - // after the output op, so that get original float data conveniently - void _insertDequantize(); + void _insertScale(); }; #endif // CALIBRATION_HPP diff --git a/tools/quantization/preprocessConfig.json b/tools/quantization/imageInputConfig.json similarity index 84% rename from tools/quantization/preprocessConfig.json rename to tools/quantization/imageInputConfig.json index f70e2bb8..8bfa6497 100644 --- a/tools/quantization/preprocessConfig.json +++ b/tools/quantization/imageInputConfig.json @@ -12,8 +12,8 @@ ], "width":224, "height":224, - "path":"path/to/images/", - "used_image_num":500, + "path":"path/to/images", + "used_sample_num":500, "feature_quantize_method":"KL", "weight_quantize_method":"MAX_ABS", "feature_clamp_value":127, @@ -21,5 +21,6 @@ "skip_quant_op_names":[ "skip_quant_op_name1", "skip_quant_op_name2" ], + "input_type":"image", "debug":false } diff --git a/tools/quantization/quantizeWeight.cpp b/tools/quantization/quantizeWeight.cpp index 2ba16339..c00a3f12 100644 --- a/tools/quantization/quantizeWeight.cpp +++ b/tools/quantization/quantizeWeight.cpp @@ -133,55 +133,53 @@ int SymmetricQuantizeWeight(const float* weight, const int size, int8_t* quantiz } int QuantizeConvPerChannel(const float* weight, const int size, const float* bias, int8_t* quantizedWeight, - int32_t* quantizedBias, float* scale, const std::vector& inputScale, - const std::vector& outputScale, std::string method, float weightClampValue, bool mergeChannel) { - const int inputChannels = inputScale.size(); - const int outputChannels = outputScale.size(); - const int icXoc = inputChannels * outputChannels; + int32_t* quantizedBias, float* scale, const float inputScale, const float outputScale, + const int inputChannel, const int outputChannel, std::string method, float weightClampValue, bool mergeChannel) { + const int 
icXoc = inputChannel * outputChannel; DCHECK(size % icXoc == 0) << "Input Data Size Error!"; - std::vector quantizedWeightScale(outputChannels); + std::vector quantizedWeightScale(outputChannel); float inputScalexWeight = 1.0f; if (mergeChannel) { if (method == "MAX_ABS"){ - SymmetricQuantizeWeight(weight, size, quantizedWeight, quantizedWeightScale.data(), outputChannels, weightClampValue); + SymmetricQuantizeWeight(weight, size, quantizedWeight, quantizedWeightScale.data(), outputChannel, weightClampValue); } else if (method == "ADMM") { - QuantizeWeightADMM(weight, size, quantizedWeight, quantizedWeightScale.data(), outputChannels, weightClampValue); + QuantizeWeightADMM(weight, size, quantizedWeight, quantizedWeightScale.data(), outputChannel, weightClampValue); } - inputScalexWeight = inputScale[0]; + inputScalexWeight = inputScale; } else { const int kernelSize = size / icXoc; - const int ocStride = size / outputChannels; + const int ocStride = size / outputChannel; std::vector weightMultiByInputScale(size); - for (int oc = 0; oc < outputChannels; ++oc) { - for (int ic = 0; ic < inputChannels; ++ic) { + for (int oc = 0; oc < outputChannel; ++oc) { + for (int ic = 0; ic < inputChannel; ++ic) { for (int i = 0; i < kernelSize; ++i) { const int index = oc * ocStride + ic * kernelSize + i; - weightMultiByInputScale[index] = inputScale[ic] * weight[index]; + weightMultiByInputScale[index] = inputScale * weight[index]; } } } if (method == "MAX_ABS"){ - SymmetricQuantizeWeight(weightMultiByInputScale.data(), size, quantizedWeight, quantizedWeightScale.data(), outputChannels, weightClampValue); + SymmetricQuantizeWeight(weightMultiByInputScale.data(), size, quantizedWeight, quantizedWeightScale.data(), outputChannel, weightClampValue); } else if (method == "ADMM") { - QuantizeWeightADMM(weightMultiByInputScale.data(), size, quantizedWeight, quantizedWeightScale.data(), outputChannels, weightClampValue); + QuantizeWeightADMM(weightMultiByInputScale.data(), size, quantizedWeight, quantizedWeightScale.data(), outputChannel, weightClampValue); } } - for (int i = 0; i < outputChannels; ++i) { - if (fabs(outputScale[i]) <= 1e-6) { + for (int i = 0; i < outputChannel; ++i) { + if (fabs(outputScale) <= 1e-6) { scale[i] = 0.0f; } else { - scale[i] = inputScalexWeight * quantizedWeightScale[i] / outputScale[0]; + scale[i] = inputScalexWeight * quantizedWeightScale[i] / outputScale; } } if (bias) { - for (int i = 0; i < outputChannels; ++i) { + for (int i = 0; i < outputChannel; ++i) { if (fabs(inputScalexWeight) <= 1e-6 || fabs(quantizedWeightScale[i]) <= 1e-6) { quantizedBias[i] = 0; } else { @@ -194,35 +192,33 @@ int QuantizeConvPerChannel(const float* weight, const int size, const float* bia } int QuantizeDepthwiseConv(const float* weight, const int size, const float* bias, int8_t* quantizedWeight, - int32_t* quantizedBias, float* scale, const std::vector& inputScale, - const std::vector& outputScale, std::string method, float weightClampValue) { - const int inputChannels = inputScale.size(); - const int outputChannels = outputScale.size(); - DCHECK(inputChannels == outputChannels) << "Input Data Size Error!"; + int32_t* quantizedBias, float* scale, const float inputScale, const float outputScale, + const int inputChannel, const int outputChannel, std::string method, float weightClampValue, bool mergeChannel) { + DCHECK(inputChannel == outputChannel) << "Input Data Size Error!"; - std::vector quantizedWeightScale(inputChannels); + std::vector quantizedWeightScale(inputChannel); if (method == 
"MAX_ABS") { - SymmetricQuantizeWeight(weight, size, quantizedWeight, quantizedWeightScale.data(), inputChannels, weightClampValue); + SymmetricQuantizeWeight(weight, size, quantizedWeight, quantizedWeightScale.data(), inputChannel, weightClampValue); } else if (method == "ADMM") { - QuantizeWeightADMM(weight, size, quantizedWeight, quantizedWeightScale.data(), inputChannels, weightClampValue); + QuantizeWeightADMM(weight, size, quantizedWeight, quantizedWeightScale.data(), inputChannel, weightClampValue); } - for (int c = 0; c < inputChannels; ++c) { + for (int c = 0; c < inputChannel; ++c) { const int index = c; - if (fabs(outputScale[c]) <= 1e-6) { + if (fabs(outputScale) <= 1e-6) { scale[index] = 0.0f; } else { - scale[index] = inputScale[c] * quantizedWeightScale[c] / outputScale[c]; + scale[index] = inputScale * quantizedWeightScale[c] / outputScale; } } if (bias) { - for (int i = 0; i < outputChannels; ++i) { - if (fabs(inputScale[i]) <= 1e-6 || fabs(quantizedWeightScale[i]) <= 1e-6) { + for (int i = 0; i < outputChannel; ++i) { + if (fabs(inputScale) <= 1e-6 || fabs(quantizedWeightScale[i]) <= 1e-6) { quantizedBias[i] = 0; } else { - quantizedBias[i] = static_cast(bias[i] / (inputScale[i] * quantizedWeightScale[i])); + quantizedBias[i] = static_cast(bias[i] / (inputScale * quantizedWeightScale[i])); } } } diff --git a/tools/quantization/quantizeWeight.hpp b/tools/quantization/quantizeWeight.hpp index b9db80d2..8cce5d26 100644 --- a/tools/quantization/quantizeWeight.hpp +++ b/tools/quantization/quantizeWeight.hpp @@ -15,16 +15,18 @@ // default: quantize weight every channel int SymmetricQuantizeWeight(const float* weight, const int size, int8_t* quantizedWeight, float* scale, const int channels, float weightClampValue); +int QuantizeWeightADMM(const float* weight, const int weightNum, int8_t* quantizedWeight, float* alpha, + const int kernelNum, const float weightClampValue); // quantize convolution weight per channle // firstly, multiply float weight by input_scale, then quantize the result to get input_sacle*weight_scale // secondly, divide input_sacle*weight_scale by output_scale int QuantizeConvPerChannel(const float* weight, const int size, const float* bias, int8_t* quantizedWeight, - int32_t* quantizedBias, float* scale, const std::vector& inputScale, - const std::vector& outputScale, std::string method, float weightClampValue, bool mergeChannel = true); + int32_t* quantizedBias, float* scale, const float inputScale, const float outputScale, + const int inputChannel, const int outputChannel, std::string method, float weightClampValue, bool mergeChannel = true); int QuantizeDepthwiseConv(const float* weight, const int size, const float* bias, int8_t* quantizedWeight, - int32_t* quantizedBias, float* scale, const std::vector& inputScale, - const std::vector& outputScale, std::string method, float weightClampValue); + int32_t* quantizedBias, float* scale, const float inputScale, const float outputScale, + const int inputChannel, const int outputChannel, std::string method, float weightClampValue, bool mergeChannel = true); #endif // QUANTIZEWEIGHT_HPP diff --git a/tools/quantization/sequenceInputConfig.json b/tools/quantization/sequenceInputConfig.json new file mode 100644 index 00000000..dd726217 --- /dev/null +++ b/tools/quantization/sequenceInputConfig.json @@ -0,0 +1,13 @@ +{ + "path":"path/to/txt/files", + "used_sample_num":500, + "feature_quantize_method":"KL", + "weight_quantize_method":"MAX_ABS", + "feature_clamp_value":127, + "weight_clamp_value":127, + 
"skip_quant_op_names":[ + "skip_quant_op_name1", "skip_quant_op_name2" + ], + "input_type":"sequence", + "debug":false +} diff --git a/tools/script/fastTestTflite.py b/tools/script/fastTestTflite.py new file mode 100644 index 00000000..80973972 --- /dev/null +++ b/tools/script/fastTestTflite.py @@ -0,0 +1,102 @@ +#!/usr/bin/python +import os +import sys +import numpy as np +import tensorflow as tf + +def makeDirForPath(filename): + if filename.find('/') < 0: + return + names = filename.split('/') + dirname = "" + for l in range(0, len(names)-1): + dirname = dirname + names[l] + '/' + if os.path.exists(dirname): + return + os.makedirs(dirname) + +class TestModel(): + def __copy_to_here(self, modelName): + newModel = 'tflite/test.tflite' + print(os.popen("mkdir tflite").read()) + print(os.popen("cp " + modelName + ' ' + newModel).read()) + self.modelName = newModel + self.model = self.__load_graph(self.modelName) + self.inputOps, self.outputOps = self.__analyze_inputs_outputs(self.model) + self.outputs = [output['name'] for output in self.outputOps] + def __init__(self, modelName): + self.__copy_to_here(modelName) + def __run_mnn(self): + result = os.popen("./TestConvertResult Tflite tflite").read() + print(result) + return result + def __load_graph(self, filename): + interpreter = tf.lite.Interpreter(model_path=filename) + interpreter.allocate_tensors() + return interpreter + def __analyze_inputs_outputs(self, graph): + inputs = graph.get_input_details() + outputs = graph.get_output_details() + return (inputs, outputs) + def __get_shape(self, op): + shape = list(op['shape']) + for i in range(len(shape)): + if shape[i] == None or shape[i] < 0: + shape[i] = 1 + else: + shape[i] = int(shape[i]) + return shape + def __run_tflite(self): + jsonDict = {} + jsonDict['inputs'] = [] + jsonDict['outputs'] = [] + inputs = {} + print(self.modelName) + for inputVar in self.inputOps: + inp = {} + inp['name'] = inputVar['name'] + inp['shape'] = self.__get_shape(inputVar) + inputs[inp['name']] = np.random.uniform(0.1, 1.2, inputVar['shape']).astype(inputVar['dtype']) + jsonDict['inputs'].append(inp) + print([output['name'] for output in self.outputOps]) + for output in self.outputOps: + jsonDict['outputs'].append(output['name']) + + import json + jsonString = json.dumps(jsonDict, indent=4) + with open('tflite/input.json', 'w') as f: + f.write(jsonString) + + print('inputs:') + for key in inputs: + print(key) + name = "tflite/" + key + '.txt' + makeDirForPath(name) + f = open(name, 'w') + np.savetxt(f, inputs[key].flatten()) + f.close() + for inp in self.inputOps: + self.model.set_tensor(inp['index'], inputs[inp['name']]) + self.model.invoke() + outputs = [] + for outp in self.outputOps: + outputs.append(self.model.get_tensor(outp['index'])) + print('outputs:') + for i in range(len(outputs)): + outputName = self.outputs[i] + name = 'tflite/' + outputName + '.txt' + print(name) + makeDirForPath(name) + # print(name, outputs[i].shape) + f = open(name, 'w') + np.savetxt(f, outputs[i].flatten()) + f.close() + def Test(self): + self.__run_tflite() + res = self.__run_mnn() + return res + +if __name__ == '__main__': + modelName = sys.argv[1] + t = TestModel(modelName) + t.Test() diff --git a/tools/script/modelTest.py b/tools/script/modelTest.py index 8659ae84..c86264de 100755 --- a/tools/script/modelTest.py +++ b/tools/script/modelTest.py @@ -27,11 +27,11 @@ command = 'testModel.out.exe' if os.name == 'nt' else './testModel.out' root_dir = os.path.join(model_root_dir, 'TestResource') print('root: ' + root_dir + 
'\n') -# subprocess.Popen is intended to replace os.popen, which is more easy to release resource and safer. -# communicate function will close process automatically def run_cmd(args): - from subprocess import Popen, PIPE, STDOUT - stdout, _ = Popen(args, stdout=PIPE, stderr=STDOUT).communicate() + cmd = args[0] + for i in range(1, len(args)): + cmd += ' ' + args[i] + stdout = os.popen(cmd).read() global total_num total_num += 1 return stdout diff --git a/tools/script/testPTQ.py b/tools/script/testPTQ.py new file mode 100755 index 00000000..6b267767 --- /dev/null +++ b/tools/script/testPTQ.py @@ -0,0 +1,74 @@ +#!/usr/bin/python3 +import sys +import os +import re + +total_num = 0 + +def run_cmd(args): + cmd = args[0] + for i in range(1, len(args)): + cmd += ' ' + args[i] + stdout = os.popen(cmd).read() + return stdout + +def parseRes(res): + pattern = re.compile(r'(\d+, \d+\.\d+)\s') + idxs = set() + avgp = 0 + items = pattern.findall(res) + for item in items: + splitIdx = item.find(',') + idx = int(item[:splitIdx]) + point = float(item[splitIdx+1:]) + idxs.add(idx) + avgp += point + avgp /= len(items) + return idxs, avgp + +def compare(origin, quant): + img_dir = '../resource/images' + for name in os.listdir(img_dir): + origin_res = run_cmd(['./pictureRecognition.out', origin, img_dir + '/' + name]) + quant_res = run_cmd(['./pictureRecognition.out', quant, img_dir + '/' + name]) + # print(origin_res, quant_res) + originIdx, originPoint = parseRes(origin_res) + quantIdx, quantPoint = parseRes(quant_res) + idxRate = len(originIdx & quantIdx) / max(len(originIdx), len(quantIdx)) + pointRate = quantPoint / originPoint + print(name, idxRate, pointRate) + if idxRate < 0.5 or pointRate < 0.5 or pointRate > 2.0: + print('False') + return False + return True + +def test(path): + global total_num + total_num += 1 + originModel = path + '/test.mnn' + quantModel = './__quantModel.mnn' + message = run_cmd(['./quantized.out', originModel, quantModel, path + '/test.json']) + res = True + try: + res = compare(originModel, quantModel) + except: + print('Quant Error!') + res = False + return res + +if __name__ == '__main__': + model_root_dir = sys.argv[1] + root_dir = os.path.join(model_root_dir, 'TestPTQ') + print('root: ' + root_dir + '\n') + gWrong = [] + for name in os.listdir(root_dir): + if name == '.DS_Store': + continue + print(name) + res = test(root_dir + '/' + name) + if not res: + gWrong.append(name) + print('Wrong: %d' %len(gWrong)) + for w in gWrong: + print(w) + print('### Wrong/Total: %d / %d ###'%(len(gWrong), total_num)) diff --git a/tools/train/source/demo/MobilenetV2Utils.cpp b/tools/train/source/demo/MobilenetV2Utils.cpp index 44bf596a..0b77e7be 100644 --- a/tools/train/source/demo/MobilenetV2Utils.cpp +++ b/tools/train/source/demo/MobilenetV2Utils.cpp @@ -32,8 +32,7 @@ using namespace MNN::Train; void MobilenetV2Utils::train(std::shared_ptr model, const int numClasses, const int addToLabel, std::string trainImagesFolder, std::string trainImagesTxt, - std::string testImagesFolder, std::string testImagesTxt, - const int trainQuantDelayEpoch, const int quantBits) { + std::string testImagesFolder, std::string testImagesTxt, const int quantBits) { auto exe = Executor::getGlobalExecutor(); BackendConfig config; exe->setGlobalExecutorConfig(MNN_FORWARD_USER_1, config, 2); @@ -76,11 +75,6 @@ void MobilenetV2Utils::train(std::shared_ptr model, const int numClasses AUTOTIME; trainDataLoader->reset(); model->setIsTraining(true); - // turn float model to quantize-aware-training model after a delay 
-        if (epoch == trainQuantDelayEpoch) {
-            // turn model to train quant model
-            std::static_pointer_cast<PipelineModule>(model)->toTrainQuant(quantBits);
-        }
         for (int i = 0; i < trainIterations; i++) {
             AUTOTIME;
             auto trainData = trainDataLoader->next();
diff --git a/tools/train/source/demo/MobilenetV2Utils.hpp b/tools/train/source/demo/MobilenetV2Utils.hpp
index d3246fcd..67cec793 100644
--- a/tools/train/source/demo/MobilenetV2Utils.hpp
+++ b/tools/train/source/demo/MobilenetV2Utils.hpp
@@ -16,8 +16,7 @@ class MobilenetV2Utils {
 public:
     static void train(std::shared_ptr model, const int numClasses, const int addToLabel,
                       std::string trainImagesFolder, std::string trainImagesTxt,
-                      std::string testImagesFolder, std::string testImagesTxt,
-                      const int trainQuantDelayEpoch = 10, const int quantBits = 8);
+                      std::string testImagesFolder, std::string testImagesTxt, const int quantBits = 8);
 };

 #endif
diff --git a/tools/train/source/demo/nnGradTest.cpp b/tools/train/source/demo/nnGradTest.cpp
index c3790321..8563ed35 100644
--- a/tools/train/source/demo/nnGradTest.cpp
+++ b/tools/train/source/demo/nnGradTest.cpp
@@ -308,8 +308,40 @@ public:
         return 0;
     }
 };
+class GatherGradTest : public DemoUnit {
+public:
+    virtual int run(int argc, const char* argv[]) override {
+        MNN_PRINT("Test grad for Gather\n");
+        {
+            // set input data
+            const float inpudata[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0,
+                                      14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+            // zero-initialize the trainable parameter (std::vector takes (count, value))
+            std::vector<float> inputDataRaw(sizeof(inpudata) / sizeof(float), 0.0f);
+            auto params = _TrainableParam(inputDataRaw.data(), {4, 3, 2}, NCHW, halide_type_of<float>());
+            const int indices_data[] = {1, 0, 1, 0};
+            const std::vector<float> expectedOutput = {7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0,
+                                                       7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+            std::shared_ptr<Module> _m(Module::createEmpty({params}));
+            std::shared_ptr<SGD> sgd(new SGD(_m));
+            sgd->setLearningRate(0.01f);
+            for (int i = 0; i < 1000; ++i) {
+                auto indices = _Const(indices_data, {4}, NCHW, halide_type_of<int>());
+                auto output = _GatherV2(params, indices, nullptr);
+                output = _Reshape(output, {-1});
+                auto predictValue = _Const(expectedOutput.data(), {(int)expectedOutput.size()}, NCHW);
+                auto loss = _ReduceMean(_Square(_Subtract(output, predictValue)), {});
+                if (i % 100 == 0) {
+                    MNN_PRINT("Loss = %f\n", loss->readMap<float>()[0]);
+                }
+                sgd->step(loss);
+            }
+        }
+        return 0;
+    }
+};
 DemoUnitSetRegister(NNGrad, "NNGrad");
 DemoUnitSetRegister(NNGradV2, "NNGradV2");
 DemoUnitSetRegister(NNGradV3, "NNGradV3");
 DemoUnitSetRegister(MatMulGradTest, "MatMulGradTest");
+DemoUnitSetRegister(GatherGradTest, "GatherGradTest");
diff --git a/tools/train/source/grad/GatherGrad.cpp b/tools/train/source/grad/GatherGrad.cpp
new file mode 100644
index 00000000..fffd5bed
--- /dev/null
+++ b/tools/train/source/grad/GatherGrad.cpp
@@ -0,0 +1,45 @@
+
+//
+//  GatherGrad.cpp
+//  MNN
+//
+//  Created by MNN on 2021/02/20.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#include "OpGrad.hpp"
+using namespace std;
+using namespace MNN;
+using namespace MNN::Express;
+
+class GatherGrad : public OpGrad {
+public:
+    virtual std::vector<Express::VARP> onGrad(Express::EXPRP expr,
+                                              const std::vector<Express::VARP>& backwardOutput) override {
+        auto op = expr->get();
+        const auto& inputs = expr->inputs();
+        auto param  = inputs[0];
+        auto indice = inputs[1];
+        auto dims = indice->getInfo()->dim;
+        dims.emplace_back(1);
+        indice = _Reshape(indice, dims);
+        int axis = 0;
+        std::vector<Express::VARP> res(inputs.size());
+        if (inputs.size() > 2) {
+            axis = inputs[2]->readMap<int>()[0];
+        }
+        if (axis != 0) {
+            MNN_ERROR("Currently don't support axis != 0 grad for gather\n");
+            return res;
+        }
+        auto shape = _Shape(param);
+        res[0] = _ScatterNd(indice, backwardOutput[0], shape);
+        return res;
+    }
+};
+
+static const auto gRegister = []() {
+    static GatherGrad _c;
+    OpGrad::insert((int)OpType_GatherV2, &_c);
+    return true;
+}();
diff --git a/tools/train/source/models/Lenet.cpp b/tools/train/source/models/Lenet.cpp
index 2af87fc1..6242ba8a 100644
--- a/tools/train/source/models/Lenet.cpp
+++ b/tools/train/source/models/Lenet.cpp
@@ -37,6 +37,7 @@ std::vector<Express::VARP> Lenet::onForward(const std::vector<Express::VARP>& inputs) {
     x = conv2->forward(x);
     x = _MaxPool(x, {2, 2}, {2, 2});
     x = _Reshape(x, {0, -1});
+    x = _Convert(x, NCHW);
     x = ip1->forward(x);
     x = _Relu(x);
     x = dropout->forward(x);
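
For reference, a standalone sketch of the relationship GatherGrad.cpp relies on: the gradient of a gather along axis 0 is a scatter-add of the upstream gradient into a zero tensor shaped like the parameter, which is what _ScatterNd(indice, backwardOutput[0], _Shape(param)) expresses. The sketch below is illustrative only (plain C++, no MNN APIs, hypothetical names) and assumes duplicate indices accumulate, as with TensorFlow's scatter_nd.

#include <cstdio>
#include <vector>

int main() {
    const int rows = 4, rowSize = 6;               // a [4, 3, 2] parameter viewed as 4 rows of 6 values
    const std::vector<int> indices = {1, 0, 1, 0}; // gather selects rows 1, 0, 1, 0
    // upstream gradient: one value per gathered element; use 1.0 so the per-row counts are visible
    std::vector<float> upstream(indices.size() * rowSize, 1.0f);

    // scatter-add the upstream gradient back into a zero tensor of the parameter's shape
    std::vector<float> paramGrad(rows * rowSize, 0.0f);
    for (size_t k = 0; k < indices.size(); ++k) {
        for (int j = 0; j < rowSize; ++j) {
            paramGrad[indices[k] * rowSize + j] += upstream[k * rowSize + j];
        }
    }

    // rows 0 and 1 were gathered twice each, rows 2 and 3 never: expect 2 2 0 0
    for (int r = 0; r < rows; ++r) {
        std::printf("grad of row %d: %.1f\n", r, paramGrad[r * rowSize]);
    }
    return 0;
}

Rows that are never selected receive zero gradient, which is consistent with GatherGradTest above only driving the gathered rows toward expectedOutput.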