mirror of https://github.com/alibaba/MNN.git
[MNN:Sync] Sync Internal 2.8.1
This commit is contained in:
parent 1a5609b861
commit 3b978d9d16
@ -489,6 +489,7 @@ IF(MNN_COREML)
    IF(MNN_SEP_BUILD)
        list(APPEND MNN_DEPS MNNCoreML)
        list(APPEND MNN_EXTRA_DEPENDS MNNCoreML)
    ELSE()
        list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNCoreML>)
    ENDIF()

@ -552,6 +553,7 @@ IF(MNN_OPENCL)
    IF(MNN_SEP_BUILD)
        list(APPEND MNN_DEPS MNN_CL)
    ELSE()
        add_definitions(-DMNN_OPENCL_ENABLED=1)
        list(APPEND MNN_TARGETS MNN_CL)
        list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNN_CL>)
        list(APPEND MNN_EXTRA_DEPENDS ${MNN_OCL_LIBS})
@ -0,0 +1,82 @@
Pod::Spec.new do |s|
    s.name = "MNN"
    s.version = "2.2.0"
    s.summary = "MNN"

    s.description = <<-DESC
    MNN is a lightweight deep neural network inference framework. It loads models and does inference on devices.
    DESC

    s.homepage = "https://github.com/alibaba/MNN"
    s.license = {
        :type => 'Apache License, Version 2.0',
        :text => <<-LICENSE
        Copyright © 2018, Alibaba Group Holding Limited

        Licensed under the Apache License, Version 2.0 (the "License");
        you may not use this file except in compliance with the License.
        You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

        Unless required by applicable law or agreed to in writing, software
        distributed under the License is distributed on an "AS IS" BASIS,
        WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
        See the License for the specific language governing permissions and
        limitations under the License.
        LICENSE
    }

    s.author = { "MNN" => "MNN@alibaba-inc.com" }
    s.platform = :ios
    s.ios.deployment_target = '8.0'
    s.requires_arc = true

    #s.source = { :git => "git@github.com:alibaba/MNN.git", :branch => 'master' }
    s.source = { :git => "/Users/zhang/Development/AliNNPrivate/", :branch => 'head' }
    s.frameworks = 'Metal', 'Accelerate', 'CoreML'
    s.library = 'c++'
    s.source_files = \
        'include/MNN/*.{h,hpp}',\
        'include/MNN/expr/*.{h,hpp}',\
        'schema/current/*.{h}',\
        '3rd_party/flatbuffers/include/flatbuffers/*.{h}',\
        'source/internal/logging/*.{hpp,cpp}',\
        'source/internal/logging/ios/*.{h,c,m,mm,cc,hpp,cpp}',\
        'source/internal/logging/aliyun-log-c-sdk/src/*.{h,c,m,mm,cc,hpp,cpp}',\
        'source/core/**/*.{h,c,m,mm,cc,hpp,cpp}',\
        'source/common/**/*.{h,c,m,mm,cc,hpp,cpp}',\
        'source/utils/**/*.{h,c,m,mm,cc,hpp,cpp}',\
        'source/geometry/**/*.{h,c,m,mm,cc,hpp,cpp}',\
        'source/cv/**/*.{h,c,m,mm,cc,hpp,cpp}',\
        'source/math/**/*.{h,c,m,mm,cc,hpp,cpp,metal}',\
        'source/shape/*.{h,c,m,mm,cc,hpp,cpp}',\
        'source/shape/render/*.{h,c,m,mm,cc,hpp,cpp}',\
        #'source/backend/arm82/*.{h,c,m,mm,cc,S,hpp,cpp}',\
        #'source/backend/arm82/asm/**/*.{h,c,m,mm,cc,S,hpp,cpp}',\
        'source/backend/cpu/*.{h,c,m,mm,cc,S,hpp,cpp}',\
        'source/backend/cpu/render/*.{h,c,m,mm,cc,S,hpp,cpp}',\
        'source/backend/cpu/bf16/*.{h,c,m,mm,cc,S,hpp,cpp}',\
        'source/backend/cpu/arm/**/*.{h,c,m,mm,cc,S,hpp,cpp}',\
        'source/backend/cpu/compute/*.{h,c,m,mm,cc,S,hpp,cpp}',\
        'source/backend/metal/*.{h,c,m,mm,cc,hpp,cpp,metal}',\
        'source/backend/metal/render/*.{h,c,m,mm,cc,hpp,cpp,metal}',\
        'source/backend/coreml/backend/*.{h,c,m,mm,cc,hpp,cpp,metal}',\
        'source/backend/coreml/execution/*.{h,c,m,mm,cc,hpp,cpp,metal}',\
        'source/backend/coreml/mlmodel/src/*.{h,c,m,mm,cc,hpp,cpp,metal}',\
        'express/**/*.{hpp,cpp}',\
        'tools/cv/include/**/*.{h,c,m,mm,cc,hpp,cpp,metal}',\
        'tools/cv/source/imgproc/*.{h,c,m,mm,cc,hpp,cpp,metal}',\
        'tools/cv/source/calib3d/*.{h,c,m,mm,cc,hpp,cpp,metal}'

    s.header_mappings_dir = 'include'
    s.subspec 'cv' do |sp|
        sp.source_files = 'tools/cv/include/**/*.hpp'
        sp.header_mappings_dir = 'tools/cv/include'
        sp.xcconfig = { 'ALWAYS_SEARCH_USER_PATHS' => 'NO' }
    end

    s.compiler_flags = '-arch arm64 -march=armv8.2-a+simd+fp16'
    s.pod_target_xcconfig = {'METAL_LIBRARY_FILE_BASE' => 'mnn', 'HEADER_SEARCH_PATHS' => '"$(PODS_TARGET_SRCROOT)/include" "$(PODS_TARGET_SRCROOT)/3rd_party/flatbuffers/include" "$(PODS_TARGET_SRCROOT)/source" "$(PODS_TARGET_SRCROOT)/3rd_party/half" "$(PODS_TARGET_SRCROOT)/source/backend/coreml/mlmodel/include" "$(PODS_TARGET_SRCROOT)/tools/cv/include"', 'GCC_PREPROCESSOR_DEFINITIONS' => '$(inherited) MNN_CODEGEN_REGISTER=1 MNN_SUPPORT_TFLITE_QUAN=1 MNN_METAL_ENABLED=1 MNN_METAL_FULL_PRECISION=1 MNN_SUPPORT_RENDER=1 MNN_SUPPORT_BF16=1 MNN_COREML_ENABLED=1 USE_LZ4_FLAG=1 MNN_INTERNAL_ENABLED=1 MNN_USE_SPARSE_COMPUTE=1'}
    s.user_target_xcconfig = { 'OTHER_LDFLAGS' => '-force_load $(BUILD_DIR)/$(CONFIGURATION)$(EFFECTIVE_PLATFORM_NAME)/MNN/libMNN.a', 'HEADER_SEARCH_PATHS' => '"$(PODS_TARGET_SRCROOT)/include"' }
end
@ -55,14 +55,10 @@
- `checkInvalidValue.out` checks the data in the output directory
- `timeProfile.out` measures how long a model takes on a given backend and reports each layer's share of the total time
- `testTrain.out` tests the training feature
- `aoa_nlu_encoder.out` tests the NLU encoder
- `aoa_nlu_decoder1.out` tests NLU decoder 1
- `aoa_nlu_decoder2.out` tests NLU decoder 2
- `checkDir.out` checks whether two directories have identical contents
- `checkFile.out` checks whether two files are identical
- `winogradExample.out` Winograd example
- `winogradGenerateGLSL.out` generates GLSL for Winograd
- `winogradGenerateCL.out` generates CL for Winograd
- `fuseTest` tests custom GPU operators; currently only the Vulkan Buffer mode is supported
## Benchmark tool
- Related build options
  - `MNN_BUILD_BENCHMARK` whether to build the benchmark tool
@ -2195,6 +2195,25 @@ array([[[[0., 1.]],
[[6., 7.]]]], dtype=float32)
```

---
### `reverse(x, axis)`
Reverses the input variable x along dimension axis[0]

Parameters:
- `x : var_like` input variable
- `axis : var_like` axis variable; only axis[0] is used

Returns: the reversed sequence

Return type: `Var`

Example:

```python
>>> expr.reverse(expr.range(-4., 4., 1.), [0])
array([ 3., 2., 1., 0., -1., -2., -3., -4.], dtype=float32)
```

---
### `reverse_sequence(x, y, batch_dim, seq_dim)`
Slices x along dimension batch_dim and reverses the first y[i] elements of slice i along dimension seq_dim; for example, with batch_dim=0 and seq_dim=1, row i of x has its first y[i] elements reversed
@ -457,3 +457,14 @@ Matrix:
0.0000000 0.0000000 1.0000000
```

## fuseTest
### Function
Tests custom GPU operators; currently only the Vulkan Buffer mode is supported

### Parameters
`Usage: ./fuseTest user.spirv config.json`
- `user.spirv:str`: path to the SPIR-V file; it can be produced with `glslangValidator -V user.comp -o user.spirv`
- `config.json:str`: path to the configuration file
### Example
```bash
$ ./fuseTest user.spirv user.json
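# Sketch (not in the original doc): the SPIR-V input can be produced from a GLSL
# compute shader with the glslangValidator command quoted in the parameter list above.
$ glslangValidator -V user.comp -o user.spirv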
|
||||
|
|
|
@ -120,7 +120,7 @@ Executor::Requirement Executor::getRequirement(Expr* expr) const {
|
|||
return req;
|
||||
}
|
||||
for (int i = 0; i < inputSize; ++i) {
|
||||
req.contentNeedContent[i] = OpCommonUtils::opNeedContent(op->type(), i);
|
||||
req.contentNeedContent[i] = OpCommonUtils::opNeedContent(op, i);
|
||||
req.shapeNeedContent[i] = false;
|
||||
}
|
||||
auto needIndexId = SizeComputer::needInputContent(op, inputSize);
|
||||
|
|
|
@ -192,6 +192,17 @@ EXPRP Expr::create(std::shared_ptr<BufferStorage> extra, std::vector<VARP>&& inp
|
|||
EXPRP expr(new Expr(outputSize));
|
||||
expr->mStorage = extra;
|
||||
expr->mOp = flatbuffers::GetRoot<Op>(extra->buffer());
|
||||
switch (expr->mOp->type()) {
|
||||
case OpType_Const:
|
||||
expr->mType = VARP::CONSTANT;
|
||||
break;
|
||||
case OpType_TrainableParam:
|
||||
expr->mType = VARP::TRAINABLE;
|
||||
break;
|
||||
default:
|
||||
expr->mType = VARP::INPUT;
|
||||
break;
|
||||
}
|
||||
expr->mInputs = std::move(inputs);
|
||||
auto exe = ExecutorScope::Current();
|
||||
expr->mInside->mReq = exe->getRequirement(expr.get());
|
||||
|
|
|
@ -626,6 +626,13 @@ VARP _ChannelShuffle(VARP x, int group) {
|
|||
x = _Convert(x, NC4HW4);
|
||||
return x;
|
||||
}
|
||||
|
||||
VARP _Reverse(VARP x, VARP axis) {
|
||||
std::unique_ptr<MNN::OpT> op(new MNN::OpT);
|
||||
op->type = MNN::OpType_Reverse;
|
||||
return (Variable::create(Expr::create(op.get(), {x, axis})));
|
||||
}
|
||||
|
||||
VARP _ReverseSequence(VARP x, VARP y, int batchDim, int seqDim) {
|
||||
std::unique_ptr<OpT> op(new OpT);
|
||||
op->type = OpType_ReverseSequence;
|
||||
|
@ -1710,19 +1717,10 @@ VARP _GridSample(VARP input, VARP grid, InterpolationMethod mode, GridSamplePadd
|
|||
}
|
||||
|
||||
VARP _FloatToInt8(VARP x, VARP scale, char minValue/*For future*/, char maxValue/*For future*/) {
|
||||
auto xInfo = x->getInfo();
|
||||
auto scaleInfo = scale->getInfo();
|
||||
auto scalePtr = scale->readMap<float>();
|
||||
if (nullptr == scalePtr || nullptr == xInfo || nullptr == scaleInfo) {
|
||||
MNN_ERROR("Error for FloatToInt8 because var not ready\n");
|
||||
return nullptr;
|
||||
}
|
||||
if (xInfo->order != NC4HW4 || xInfo->type.code != halide_type_float) {
|
||||
MNN_ERROR("Not Support Input for FloatToInt8 because var not NC4HW4 or not float\n");
|
||||
return nullptr;
|
||||
}
|
||||
if ((scaleInfo->size != xInfo->dim[1]) && (scaleInfo->size != 1)) {
|
||||
MNN_ERROR("Scale's size not match input's channel: %d - %d\n", scaleInfo->size, xInfo->dim[1]);
|
||||
if (nullptr == scalePtr || nullptr == scaleInfo) {
|
||||
MNN_ERROR("Error for FloatToInt8 because scale not ready\n");
|
||||
return nullptr;
|
||||
}
|
||||
std::unique_ptr<OpT> op(new OpT);
|
||||
|
@ -1735,21 +1733,12 @@ VARP _FloatToInt8(VARP x, VARP scale, char minValue/*For future*/, char maxValue
|
|||
}
|
||||
|
||||
VARP _FloatToInt8(VARP x, VARP scale, int8_t minValue, int8_t maxValue, int8_t zeroPoint) {
|
||||
auto xInfo = x->getInfo();
|
||||
auto scaleInfo = scale->getInfo();
|
||||
auto scalePtr = scale->readMap<float>();
|
||||
if (nullptr == scalePtr || nullptr == xInfo || nullptr == scaleInfo) {
|
||||
if (nullptr == scalePtr || nullptr == scaleInfo) {
|
||||
MNN_ERROR("Error for FloatToInt8 because var not ready\n");
|
||||
return nullptr;
|
||||
}
|
||||
if (xInfo->order != NC4HW4 || xInfo->type.code != halide_type_float) {
|
||||
MNN_ERROR("Not Support Input for FloatToInt8 because var not NC4HW4 or not float\n");
|
||||
return nullptr;
|
||||
}
|
||||
if ((scaleInfo->size != xInfo->dim[1]) && (scaleInfo->size != 1)) {
|
||||
MNN_ERROR("Scale's size not match input's channel: %d - %d\n", scaleInfo->size, xInfo->dim[1]);
|
||||
return nullptr;
|
||||
}
|
||||
std::unique_ptr<OpT> op(new OpT);
|
||||
op->type = OpType_FloatToInt8;
|
||||
op->main.type = OpParameter_QuantizedFloatParam;
|
||||
|
|
|
@ -58,6 +58,10 @@ ExprModule::ExprModule(EXPRP expr) {
|
|||
break;
|
||||
}
|
||||
}
|
||||
// TODO: Optimize the logic
|
||||
if (!mExpr->mCanDecompose) {
|
||||
ExecutorScope::Current()->setLazyComputeMode(Executor::LAZY_CONTENT);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<VARP> ExprModule::onForward(const std::vector<VARP>& inputs) {
|
||||
|
@ -72,6 +76,14 @@ std::vector<VARP> ExprModule::onForward(const std::vector<VARP>& inputs) {
|
|||
std::vector<VARP> outputVars;
|
||||
auto newExpr = Expr::create(mExpr->extra(), std::move(tempInputs), mExpr->outputSize());
|
||||
newExpr->setName(mExpr->name());
|
||||
if (!mExpr->mCanDecompose) {
|
||||
// Set tensor shape from net
|
||||
newExpr->mCanDecompose = false;
|
||||
for (int index = 0; index < mExpr->outputSize(); ++index) {
|
||||
TensorUtils::copyShape(mExpr->inside()->mOutputTensors[index], newExpr->inside()->mOutputTensors[index], true, true);
|
||||
Utils::copyTensorToInfo(newExpr->inside()->mOutputInfos.data() + index, newExpr->inside()->mOutputTensors[index]);
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < mExpr->outputSize(); ++i) {
|
||||
outputVars.emplace_back(Variable::create(newExpr, i));
|
||||
}
|
||||
|
@ -562,6 +574,23 @@ Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::
|
|||
config = &defaultConfig;
|
||||
}
|
||||
auto subGraphs = net->subgraphs();
|
||||
if (config->dynamic) {
|
||||
// TODO: Support subgraph
|
||||
if (nullptr == subGraphs) {
|
||||
auto varMap = MNN::Express::Variable::loadMap(buffer, length);
|
||||
std::vector<MNN::Express::VARP> inputsVar(inputs.size());
|
||||
for (int i=0; i<inputs.size(); ++i) {
|
||||
inputsVar[i] = varMap[inputs[i]];
|
||||
}
|
||||
std::vector<MNN::Express::VARP> outputsVar(outputs.size());
|
||||
for (int i=0; i<outputs.size(); ++i) {
|
||||
outputsVar[i] = varMap[outputs[i]];
|
||||
}
|
||||
return extract(inputsVar, outputsVar, false);
|
||||
} else {
|
||||
MNN_ERROR("Don't support subgraph for dynamic load, turn back to static load\n");
|
||||
}
|
||||
}
|
||||
std::map<std::string, SubGraph> subGraphMap;
|
||||
_createSubGraph(net, rtMgr, config, subGraphMap);
|
||||
std::shared_ptr<BufferStorage> bufferStorage(new BufferStorage);
|
||||
|
|
|
@ -69,6 +69,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
#define STR(x) STR_IMP(x)
#define MNN_VERSION_MAJOR 2
#define MNN_VERSION_MINOR 8
#define MNN_VERSION_PATCH 0
#define MNN_VERSION_PATCH 1
#define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
#endif /* MNNDefine_h */
|
||||
|
|
|
@ -24,6 +24,15 @@ struct MNNVulkanContext {
|
|||
uint32_t iQueueFamilyIndex;
|
||||
};
|
||||
|
||||
struct MNNVulkanTensorContent {
|
||||
VkBuffer buffer;
|
||||
VkDeviceSize size;
|
||||
VkDeviceSize offset;
|
||||
|
||||
halide_type_t realType;
|
||||
int32_t mask; // For future usage
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef MNN_METAL
|
||||
|
@ -36,6 +45,9 @@ struct MNNMetalTensorContent {
|
|||
id<MTLBuffer> buffer;
|
||||
int32_t offset;
|
||||
id<MTLTexture> texture;
|
||||
|
||||
halide_type_t type;
|
||||
int32_t mask;
|
||||
int32_t forFuture[8];
|
||||
};
|
||||
|
||||
|
|
|
@ -275,6 +275,12 @@ public:
|
|||
mBuffer.dim[index].extent = length;
|
||||
}
|
||||
|
||||
/**
 * @brief For GPU and other devices, get the device memory directly; see MNNSharedContext for details.
 * @return Success or not. Returns false if forwardType differs from the tensor's backend type or the backend is CPU.
 */
bool getDeviceInfo(void* dst, int forwardType) const;
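Below is a minimal usage sketch of the new API. It is an assumption, not part of this diff: it pairs `getDeviceInfo` with the `MNNVulkanTensorContent` struct added to MNNSharedContext.h earlier in this commit; the Metal path is exercised the same way in the GpuCache change later in the commit.

```cpp
// Sketch only: assumes MNN was built with the Vulkan backend (MNN_VULKAN) and
// that `tensor` lives on that backend; the fields follow MNNSharedContext.h above.
#include <MNN/Tensor.hpp>
#include <MNN/MNNForwardType.h>
#include <MNN/MNNSharedContext.h>

bool describeVulkanTensor(MNN::Tensor* tensor) {
    MNNVulkanTensorContent content;
    if (!tensor->getDeviceInfo(&content, MNN_FORWARD_VULKAN)) {
        return false; // backend mismatch or a CPU tensor
    }
    // content.buffer / content.offset / content.size describe the VkBuffer region
    // holding the tensor data; content.realType gives the element type.
    return true;
}
```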
|
||||
|
||||
public:
|
||||
/**
|
||||
* @brief print tensor data. for DEBUG use only.
|
||||
|
|
|
@ -267,6 +267,7 @@ private:
|
|||
bool mVisited = false;
|
||||
std::vector<WeakEXPRP> mTo;
|
||||
bool mCanDecompose = true;
|
||||
friend class ExprModule;
|
||||
|
||||
};
|
||||
} // namespace Express
|
||||
|
|
|
@ -77,6 +77,7 @@ MNN_PUBLIC VARP _ChangeInputFormat(VARP input, Dimensionformat format);
|
|||
MNN_PUBLIC VARP _Conv2DBackPropFilter(VARP input, VARP inputGrad, INTS kernelSize, PaddingMode pad = VALID, INTS stride = {1, 1}, INTS dilate = {1, 1}, int group = 1, INTS pads = {0, 0});
|
||||
MNN_PUBLIC VARP _PoolGrad(VARP originInput, VARP originOutput, VARP inputGrad, INTS kernel, INTS stride, PoolingMode type, PaddingMode pad = VALID, INTS pads= {0, 0});
|
||||
// FIXME: move the api to Array Ops
|
||||
MNN_PUBLIC VARP _Reverse(VARP x, VARP axis);
|
||||
MNN_PUBLIC VARP _ReverseSequence(VARP x, VARP y, int batchDim, int seqDim);
|
||||
// FIXME: move the api to Image Ops
|
||||
MNN_PUBLIC VARP _Crop(VARP images, VARP size, int axis, INTS offset);
|
||||
|
|
|
@ -1,64 +0,0 @@
|
|||
//
|
||||
// cli_demo.cpp
|
||||
//
|
||||
// Created by MNN on 2023/03/24.
|
||||
// ZhaodeWang
|
||||
//
|
||||
|
||||
#include "llm.hpp"
|
||||
#include <fstream>
|
||||
#include <stdlib.h>
|
||||
|
||||
void benchmark(Llm* llm, std::string prompt_file) {
|
||||
std::cout << "prompt file is " << prompt_file << std::endl;
|
||||
std::ifstream prompt_fs(prompt_file);
|
||||
std::vector<std::string> prompts;
|
||||
std::string prompt;
|
||||
while (std::getline(prompt_fs, prompt)) {
|
||||
// prompt start with '#' will be ignored
|
||||
if (prompt.substr(0, 1) == "#") {
|
||||
continue;
|
||||
}
|
||||
prompts.push_back(prompt);
|
||||
}
|
||||
int prompt_len = 0;
|
||||
int decode_len = 0;
|
||||
int64_t prefill_time = 0;
|
||||
int64_t decode_time = 0;
|
||||
// llm->warmup();
|
||||
for (int i = 0; i < prompts.size(); i++) {
|
||||
llm->response(prompts[i]);
|
||||
prompt_len += llm->prompt_len_;
|
||||
decode_len += llm->gen_seq_len_;
|
||||
prefill_time += llm->prefill_us_;
|
||||
decode_time += llm->decode_us_;
|
||||
llm->reset();
|
||||
}
|
||||
float prefill_s = prefill_time / 1e6;
|
||||
float decode_s = decode_time / 1e6;
|
||||
printf("\n#################################\n");
|
||||
printf("prompt tokens num = %d\n", prompt_len);
|
||||
printf("decode tokens num = %d\n", decode_len);
|
||||
printf("prefill time = %.2f s\n", prefill_s);
|
||||
printf(" decode time = %.2f s\n", decode_s);
|
||||
printf("prefill speed = %.2f tok/s\n", prompt_len / prefill_s);
|
||||
printf(" decode speed = %.2f tok/s\n", decode_len / decode_s);
|
||||
printf("##################################\n");
|
||||
}
|
||||
|
||||
int main(int argc, const char* argv[]) {
|
||||
if (argc < 2) {
|
||||
std::cout << "Usage: " << argv[0] << " model_dir <prompt.txt>" << std::endl;
|
||||
return 0;
|
||||
}
|
||||
std::string model_dir = argv[1];
|
||||
std::cout << "model path is " << model_dir << std::endl;
|
||||
std::unique_ptr<Llm> llm(Llm::createLLM(model_dir));
|
||||
llm->load(model_dir);
|
||||
if (argc < 3) {
|
||||
llm->chat();
|
||||
}
|
||||
std::string prompt_file = argv[2];
|
||||
benchmark(llm.get(), prompt_file);
|
||||
return 0;
|
||||
}
|
|
@ -11,8 +11,10 @@
|
|||
#include <vector>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <iostream>
|
||||
#include <streambuf>
|
||||
#include <functional>
|
||||
#include <unordered_map>
|
||||
|
||||
#include <MNN/AutoTime.hpp>
|
||||
#include <MNN/expr/Expr.hpp>
|
||||
|
@ -25,6 +27,25 @@ using namespace MNN;
using namespace Express;
class Tokenizer;

// Llm stream buffer that forwards written data to a user-supplied callback.

class LlmStreamBuffer : public std::streambuf {
public:
    using CallBack = std::function<void(const char* str, size_t len)>;
    LlmStreamBuffer(CallBack callback) : callback_(callback) {}

protected:
    virtual std::streamsize xsputn(const char* s, std::streamsize n) override {
        if (callback_) {
            callback_(s, n);
        }
        return n;
    }

private:
    CallBack callback_ = nullptr;
};
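A small usage sketch follows (an assumption, not part of the header): the stream buffer can back a std::ostream so that each generated chunk is delivered through the callback; the `response(query, ostream*, ...)` signature is the one shown in llm.cpp later in this commit.

```cpp
// Sketch only: assumes llm.hpp (this header) is included and `llm` points to a loaded Llm.
#include <cstdio>

void stream_response(Llm* llm, const std::string& query) {
    LlmStreamBuffer::CallBack callback = [](const char* str, size_t len) {
        fwrite(str, 1, len, stdout);  // print each generated chunk as it arrives
        fflush(stdout);
    };
    LlmStreamBuffer stream_buf(callback);
    std::ostream os(&stream_buf);
    llm->response(query, &os, "\n");  // end-string argument is assumed; see Llm::response in llm.cpp
}
```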
|
||||
|
||||
class MNN_PUBLIC Llm {
|
||||
public:
|
||||
Llm() {
|
||||
|
|
|
@ -80,9 +80,8 @@ public:
|
|||
virtual std::vector<int> encode(const std::string& str) override;
|
||||
virtual std::string decode(int id) override;
|
||||
private:
|
||||
std::unordered_map<std::string, int> encoder_;
|
||||
std::vector<std::string> decoder_;
|
||||
std::vector<int> tokens_;
|
||||
std::vector<int> token_ids_;
|
||||
};
|
||||
|
||||
#endif // TOKENIZER_hpp
|
|
@ -106,8 +106,7 @@ std::string Llm::response(const std::string& query, std::ostream* os, const char
|
|||
history_ = input_ids;
|
||||
}
|
||||
|
||||
prompt_len_ = input_ids.size();
|
||||
// printf("token_num : %lu\n", input_ids.size());
|
||||
prompt_len_ = static_cast<int>(input_ids.size());
|
||||
auto st = std::chrono::system_clock::now();
|
||||
int token = forward(input_ids);
|
||||
auto et = std::chrono::system_clock::now();
|
||||
|
@ -168,20 +167,22 @@ void Llm::load(const std::string& model_dir) {
|
|||
config.type = MNN_FORWARD_CPU;
|
||||
// config.type = MNN_FORWARD_OPENCL;
|
||||
config.numThread = 4;
|
||||
// cpuBackendConfig.precision = BackendConfig::Precision_Low;
|
||||
cpuBackendConfig.precision = BackendConfig::Precision_Low;
|
||||
cpuBackendConfig.memory = BackendConfig::Memory_Low;
|
||||
config.backendConfig = &cpuBackendConfig;
|
||||
runtime_manager_.reset(Executor::RuntimeManager::createRuntimeManager(config));
|
||||
if (config.type == MNN_FORWARD_OPENCL) {
|
||||
const char* cacheFileName = ".tempcache";
|
||||
runtime_manager_->setCache(cacheFileName);
|
||||
// runtime_manager_->setCache(cacheFileName);
|
||||
}
|
||||
load_progress_ = 0.f;
|
||||
printf("load tokenizer\n");
|
||||
// 1. load vocab
|
||||
std::string tokenizer_path = model_dir + "/tokenizer.txt";
|
||||
load_progress_ += 5.f;
|
||||
tokenizer_->load(tokenizer_path);
|
||||
load_progress_ += 5.f;
|
||||
printf("load tokenizer Done\n");
|
||||
// 2. load model
|
||||
Module::Config module_config;
|
||||
module_config.shapeMutable = true;
|
||||
|
@ -228,7 +229,7 @@ void Llm::load(const std::string& model_dir) {
|
|||
}
|
||||
}
|
||||
if (config.type == MNN_FORWARD_OPENCL) {
|
||||
warmup();
|
||||
// warmup();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -369,8 +370,10 @@ bool Chatglm_6b::is_stop(int token_id) {
|
|||
std::vector<int> Chatglm2_6b::tokenizer(const std::string& query) {
|
||||
auto prompt = "问:" + query + "\n答:";
|
||||
auto ids = tokenizer_encode(prompt);
|
||||
if (history_.empty()) {
|
||||
ids.insert(ids.begin(), 64792);
|
||||
ids.insert(ids.begin(), 64790);
|
||||
}
|
||||
return ids;
|
||||
}
|
||||
|
||||
|
|
|
@ -307,81 +307,48 @@ const int CHARACTER_VOCABULARY_SIZE = 256;
|
|||
|
||||
bool Tiktoken::load(const std::string& filename) {
|
||||
std::ifstream tok_file(filename);
|
||||
int index = -1, start = 0, rough_count = 0;
|
||||
std::string token;
|
||||
while (tok_file >> token) {
|
||||
token = base64_decode(token);
|
||||
encoder_[token] = static_cast<int>(decoder_.size());
|
||||
decoder_.push_back(token);
|
||||
rough_count += token.size();
|
||||
}
|
||||
tok_file.close();
|
||||
tokens_.resize(rough_count * CHARACTER_VOCABULARY_SIZE, -1);
|
||||
token_ids_.resize(rough_count * CHARACTER_VOCABULARY_SIZE, -1);
|
||||
for (int n = 0; n < decoder_.size(); n++) {
|
||||
token = decoder_[n];
|
||||
int root = 0;
|
||||
for (int i = 0; i < token.size(); i++) {
|
||||
unsigned char x = token[i];
|
||||
// record the token id at the parent of leaf node
|
||||
if (i == token.size() - 1) {
|
||||
token_ids_[root + x] = n;
|
||||
}
|
||||
// trace down a tree node.
|
||||
// insert a subtree when needed.
|
||||
if (tokens_[root + x] == -1) {
|
||||
start += CHARACTER_VOCABULARY_SIZE;
|
||||
tokens_[root + x] = start;
|
||||
root = start;
|
||||
} else {
|
||||
root = tokens_[root + x];
|
||||
}
|
||||
}
|
||||
}
|
||||
tokens_.resize(start + CHARACTER_VOCABULARY_SIZE);
|
||||
token_ids_.resize(start + CHARACTER_VOCABULARY_SIZE);
|
||||
tokens_.shrink_to_fit();
|
||||
token_ids_.shrink_to_fit();
|
||||
return true;
|
||||
}
|
||||
|
||||
// ref: https://github.com/youkaichao/fast_bpe_tokenizer
|
||||
std::vector<int> Tiktoken::encode(const std::string& str) {
|
||||
std::vector<int> ids;
|
||||
if (str.empty()) {
|
||||
return ids;
|
||||
}
|
||||
int i = 0;
|
||||
int root = 0;
|
||||
int root_token_id = -1;
|
||||
int last_found_position = -1;
|
||||
int last_found_token_id = -1;
|
||||
size_t i = 0;
|
||||
while (i < str.size()) {
|
||||
unsigned char x = str[i];
|
||||
bool should_fall_back = false;
|
||||
if (tokens_[root + x] != -1) {
|
||||
root_token_id = token_ids_[root + x];
|
||||
root = tokens_[root + x];
|
||||
if (root_token_id != -1) {
|
||||
// a token ends at position i
|
||||
last_found_position = i;
|
||||
last_found_token_id = root_token_id;
|
||||
bool found_pair = false;
|
||||
// Attempt to match the longest possible symbol
|
||||
size_t longest_match_len = 0;
|
||||
std::string longest_match;
|
||||
|
||||
// Check substrings of decreasing length
|
||||
for (size_t len = str.size() - i; len > 0; --len) {
|
||||
std::string token = str.substr(i, len);
|
||||
auto it = encoder_.find(token);
|
||||
if (it != encoder_.end()) {
|
||||
if (len > longest_match_len) {
|
||||
longest_match_len = len;
|
||||
longest_match = it->first;
|
||||
}
|
||||
i++;
|
||||
if (i == str.size()) {
|
||||
should_fall_back = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (!longest_match.empty()) {
|
||||
ids.push_back(encoder_.at(longest_match));
|
||||
i += longest_match_len;
|
||||
} else {
|
||||
// assert(last_found_position != -1);
|
||||
should_fall_back = true;
|
||||
}
|
||||
if (should_fall_back) {
|
||||
i = last_found_position + 1;
|
||||
ids.push_back(last_found_token_id);
|
||||
// start searching from the root again
|
||||
root = 0;
|
||||
root_token_id = -1;
|
||||
last_found_position = -1;
|
||||
last_found_token_id = -1;
|
||||
// If no matching symbol is found, this typically means an error in the encoding
|
||||
// or the input text contains characters that the encoder doesn't know how to handle
|
||||
std::cerr << "Error: No encoding found for the sequence starting at position " << i << std::endl;
|
||||
return {};
|
||||
}
|
||||
}
|
||||
return ids;
|
||||
|
|
|
@ -8,6 +8,7 @@ adb push ./libMNN_Vulkan.so /data/local/tmp/$DIR/libMNN_Vulkan.so
|
|||
adb push ./libMNN_GL.so /data/local/tmp/$DIR/libMNN_GL.so
|
||||
adb push ./libMNN_Express.so /data/local/tmp/$DIR/libMNN_Express.so
|
||||
adb push ./MNNV2Basic.out /data/local/tmp/$DIR/MNNV2Basic.out
|
||||
adb push ./ModuleBasic.out /data/local/tmp/$DIR/ModuleBasic.out
|
||||
adb shell "cd /data/local/tmp/$DIR && rm -r output"
|
||||
adb shell "cd /data/local/tmp/$DIR && mkdir output"
|
||||
adb push ./unitTest.out /data/local/tmp/$DIR/unitTest.out
|
||||
|
|
|
@ -163,14 +163,12 @@
|
|||
4896D37E25FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
|
||||
4896D37F25FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
|
||||
489D7A682550FDC800AD896A /* MetalReduction.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A172550FDC800AD896A /* MetalReduction.hpp */; };
|
||||
489D7A6A2550FDC800AD896A /* MetalConvolutionGEMM.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A192550FDC800AD896A /* MetalConvolutionGEMM.hpp */; };
|
||||
489D7A6E2550FDC800AD896A /* MetalROIPooling.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A1D2550FDC800AD896A /* MetalROIPooling.hpp */; };
|
||||
489D7A6F2550FDC800AD896A /* MetalCast.mm in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A1E2550FDC800AD896A /* MetalCast.mm */; };
|
||||
489D7A702550FDC800AD896A /* MetalRaster.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A1F2550FDC800AD896A /* MetalRaster.hpp */; };
|
||||
489D7A722550FDC800AD896A /* MetalReLU6.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A212550FDC800AD896A /* MetalReLU6.hpp */; };
|
||||
489D7A732550FDC800AD896A /* MetalBackend.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A222550FDC800AD896A /* MetalBackend.hpp */; };
|
||||
489D7A762550FDC800AD896A /* MetalReduction.mm in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A252550FDC800AD896A /* MetalReduction.mm */; };
|
||||
489D7A772550FDC800AD896A /* MetalConvolutionGEMM.mm in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A262550FDC800AD896A /* MetalConvolutionGEMM.mm */; };
|
||||
489D7A782550FDC800AD896A /* MetalEltwise.mm in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A272550FDC800AD896A /* MetalEltwise.mm */; };
|
||||
489D7A792550FDC800AD896A /* MetalConvolution1x1.mm in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A282550FDC800AD896A /* MetalConvolution1x1.mm */; };
|
||||
489D7A7B2550FDC800AD896A /* MetalUnary.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A2A2550FDC800AD896A /* MetalUnary.hpp */; };
|
||||
|
@ -206,7 +204,6 @@
|
|||
489D7AA72550FDC900AD896A /* MetalScale.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A562550FDC800AD896A /* MetalScale.hpp */; };
|
||||
489D7AA82550FDC900AD896A /* MetalCast.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A572550FDC800AD896A /* MetalCast.hpp */; };
|
||||
489D7AAF2550FDC900AD896A /* MetalConvolutionWinograd.mm in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A5E2550FDC800AD896A /* MetalConvolutionWinograd.mm */; };
|
||||
489D7AB02550FDC900AD896A /* MetalDefine.h in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A5F2550FDC800AD896A /* MetalDefine.h */; };
|
||||
489D7AB32550FDC900AD896A /* MetalPReLU.mm in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A622550FDC800AD896A /* MetalPReLU.mm */; };
|
||||
489D7AB42550FDC900AD896A /* MetalBinary.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A632550FDC800AD896A /* MetalBinary.hpp */; };
|
||||
489D7AB62550FDC900AD896A /* MetalReLU6.mm in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A652550FDC800AD896A /* MetalReLU6.mm */; };
|
||||
|
@ -283,7 +280,6 @@
|
|||
4AF4FB2D269ED24C005BA97B /* MNNPackedSparseQuantMatMulEpx1.S in Sources */ = {isa = PBXBuildFile; fileRef = 4AF4FB2B269ED24C005BA97B /* MNNPackedSparseQuantMatMulEpx1.S */; };
|
||||
4AF4FB2E269ED24C005BA97B /* MNNPackedSparseQuantMatMulEpx4.S in Sources */ = {isa = PBXBuildFile; fileRef = 4AF4FB2C269ED24C005BA97B /* MNNPackedSparseQuantMatMulEpx4.S */; };
|
||||
4D0C80E32862FC4100C7CAD6 /* CoreMLOPRegister.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4D0C80E22862FC4100C7CAD6 /* CoreMLOPRegister.cpp */; };
|
||||
4D0C80E52862FC4700C7CAD6 /* CoreMLRaster.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4D0C80E42862FC4700C7CAD6 /* CoreMLRaster.metal */; };
|
||||
4D4CF4672760946500A36D9F /* miscellaneous.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4D4CF4622760946500A36D9F /* miscellaneous.cpp */; };
|
||||
4D4CF4682760946500A36D9F /* geometric.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4D4CF4632760946500A36D9F /* geometric.cpp */; };
|
||||
4D4CF4692760946500A36D9F /* filter.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4D4CF4642760946500A36D9F /* filter.cpp */; };
|
||||
|
@ -771,8 +767,11 @@
|
|||
CE125CC82A52BF6B003698C9 /* MNNBilinearSampleC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CE125CC62A52BF6B003698C9 /* MNNBilinearSampleC8.S */; };
|
||||
CE125CC92A52BF6B003698C9 /* MNNBilinearLineC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CE125CC72A52BF6B003698C9 /* MNNBilinearLineC8.S */; };
|
||||
CE7DC00028E2DE6B00797689 /* ShapeConvTranspose3D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */; };
|
||||
CE8049AC2B31C65B009B422C /* CPULayerNorm.hpp in Headers */ = {isa = PBXBuildFile; fileRef = CE8049A92B31C65B009B422C /* CPULayerNorm.hpp */; };
|
||||
CE9AFED628E54E3300566949 /* CPUInterp3D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE9AFED428E54E3300566949 /* CPUInterp3D.cpp */; };
|
||||
CE9AFED728E54E3300566949 /* CPUInterp3D.hpp in Headers */ = {isa = PBXBuildFile; fileRef = CE9AFED528E54E3300566949 /* CPUInterp3D.hpp */; };
|
||||
CEA49AA82AFD010900971CB7 /* MetalExecution.mm in Sources */ = {isa = PBXBuildFile; fileRef = CEA49AA62AFD010900971CB7 /* MetalExecution.mm */; };
|
||||
CEA49AA92AFD010900971CB7 /* MetalExecution.hpp in Headers */ = {isa = PBXBuildFile; fileRef = CEA49AA72AFD010900971CB7 /* MetalExecution.hpp */; };
|
||||
CEA82BDB2A15F8AD002CBC95 /* IdstConvolutionInt8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CEA82BD92A15F8AD002CBC95 /* IdstConvolutionInt8.cpp */; };
|
||||
CEA82BDC2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp in Headers */ = {isa = PBXBuildFile; fileRef = CEA82BDA2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp */; };
|
||||
CEDB20EB2846D07100AE9DC4 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = CEDB20EA2846D07100AE9DC4 /* AppDelegate.m */; };
|
||||
|
@ -984,14 +983,12 @@
|
|||
4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = Arm82MNNPackForMatMul_A.S; path = ../../../arm82/asm/arm64/Arm82MNNPackForMatMul_A.S; sourceTree = "<group>"; };
|
||||
4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNConvRunForLineDepthwiseFP16.S; path = ../../../arm82/asm/arm64/MNNConvRunForLineDepthwiseFP16.S; sourceTree = "<group>"; };
|
||||
489D7A172550FDC800AD896A /* MetalReduction.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalReduction.hpp; sourceTree = "<group>"; };
|
||||
489D7A192550FDC800AD896A /* MetalConvolutionGEMM.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalConvolutionGEMM.hpp; sourceTree = "<group>"; };
|
||||
489D7A1D2550FDC800AD896A /* MetalROIPooling.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalROIPooling.hpp; sourceTree = "<group>"; };
|
||||
489D7A1E2550FDC800AD896A /* MetalCast.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalCast.mm; sourceTree = "<group>"; };
|
||||
489D7A1F2550FDC800AD896A /* MetalRaster.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalRaster.hpp; sourceTree = "<group>"; };
|
||||
489D7A212550FDC800AD896A /* MetalReLU6.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalReLU6.hpp; sourceTree = "<group>"; };
|
||||
489D7A222550FDC800AD896A /* MetalBackend.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalBackend.hpp; sourceTree = "<group>"; };
|
||||
489D7A252550FDC800AD896A /* MetalReduction.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalReduction.mm; sourceTree = "<group>"; };
|
||||
489D7A262550FDC800AD896A /* MetalConvolutionGEMM.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalConvolutionGEMM.mm; sourceTree = "<group>"; };
|
||||
489D7A272550FDC800AD896A /* MetalEltwise.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalEltwise.mm; sourceTree = "<group>"; };
|
||||
489D7A282550FDC800AD896A /* MetalConvolution1x1.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalConvolution1x1.mm; sourceTree = "<group>"; };
|
||||
489D7A2A2550FDC800AD896A /* MetalUnary.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalUnary.hpp; sourceTree = "<group>"; };
|
||||
|
@ -1027,7 +1024,6 @@
|
|||
489D7A562550FDC800AD896A /* MetalScale.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalScale.hpp; sourceTree = "<group>"; };
|
||||
489D7A572550FDC800AD896A /* MetalCast.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalCast.hpp; sourceTree = "<group>"; };
|
||||
489D7A5E2550FDC800AD896A /* MetalConvolutionWinograd.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalConvolutionWinograd.mm; sourceTree = "<group>"; };
|
||||
489D7A5F2550FDC800AD896A /* MetalDefine.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = MetalDefine.h; sourceTree = "<group>"; };
|
||||
489D7A622550FDC800AD896A /* MetalPReLU.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalPReLU.mm; sourceTree = "<group>"; };
|
||||
489D7A632550FDC800AD896A /* MetalBinary.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalBinary.hpp; sourceTree = "<group>"; };
|
||||
489D7A652550FDC800AD896A /* MetalReLU6.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalReLU6.mm; sourceTree = "<group>"; };
|
||||
|
@ -1104,7 +1100,6 @@
|
|||
4AF4FB2B269ED24C005BA97B /* MNNPackedSparseQuantMatMulEpx1.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNPackedSparseQuantMatMulEpx1.S; sourceTree = "<group>"; };
|
||||
4AF4FB2C269ED24C005BA97B /* MNNPackedSparseQuantMatMulEpx4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNPackedSparseQuantMatMulEpx4.S; sourceTree = "<group>"; };
|
||||
4D0C80E22862FC4100C7CAD6 /* CoreMLOPRegister.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CoreMLOPRegister.cpp; sourceTree = "<group>"; };
|
||||
4D0C80E42862FC4700C7CAD6 /* CoreMLRaster.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = CoreMLRaster.metal; sourceTree = "<group>"; };
|
||||
4D4CF4622760946500A36D9F /* miscellaneous.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = miscellaneous.cpp; sourceTree = "<group>"; };
|
||||
4D4CF4632760946500A36D9F /* geometric.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = geometric.cpp; sourceTree = "<group>"; };
|
||||
4D4CF4642760946500A36D9F /* filter.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = filter.cpp; sourceTree = "<group>"; };
|
||||
|
@ -1603,8 +1598,11 @@
|
|||
CE125CC62A52BF6B003698C9 /* MNNBilinearSampleC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearSampleC8.S; sourceTree = "<group>"; };
|
||||
CE125CC72A52BF6B003698C9 /* MNNBilinearLineC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearLineC8.S; sourceTree = "<group>"; };
|
||||
CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ShapeConvTranspose3D.cpp; sourceTree = "<group>"; };
|
||||
CE8049A92B31C65B009B422C /* CPULayerNorm.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPULayerNorm.hpp; sourceTree = "<group>"; };
|
||||
CE9AFED428E54E3300566949 /* CPUInterp3D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUInterp3D.cpp; sourceTree = "<group>"; };
|
||||
CE9AFED528E54E3300566949 /* CPUInterp3D.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUInterp3D.hpp; sourceTree = "<group>"; };
|
||||
CEA49AA62AFD010900971CB7 /* MetalExecution.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalExecution.mm; sourceTree = "<group>"; };
|
||||
CEA49AA72AFD010900971CB7 /* MetalExecution.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalExecution.hpp; sourceTree = "<group>"; };
|
||||
CEA82BD92A15F8AD002CBC95 /* IdstConvolutionInt8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = IdstConvolutionInt8.cpp; sourceTree = "<group>"; };
|
||||
CEA82BDA2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = IdstConvolutionInt8.hpp; sourceTree = "<group>"; };
|
||||
CEDB20E72846D07100AE9DC4 /* demo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = demo.app; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||
|
@ -1913,6 +1911,7 @@
|
|||
48887410215B639D0079B12E /* cpu */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
CE8049A92B31C65B009B422C /* CPULayerNorm.hpp */,
|
||||
958375342A496E5C007C0A3E /* MNNLineDepthWiseInt8AddBiasScale_ARMV82_Unit3X3.S */,
|
||||
CEE9B95F2A3AA4EF006438F2 /* CPUSoftMaxInt8.cpp */,
|
||||
CEE9B95E2A3AA4EF006438F2 /* CPUSoftMaxInt8.hpp */,
|
||||
|
@ -2096,6 +2095,8 @@
|
|||
489D7A152550FDC800AD896A /* metal */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
CEA49AA72AFD010900971CB7 /* MetalExecution.hpp */,
|
||||
CEA49AA62AFD010900971CB7 /* MetalExecution.mm */,
|
||||
4D566298299341270031C1A1 /* MetalFuse.hpp */,
|
||||
4D566299299341270031C1A1 /* MetalFuse.mm */,
|
||||
19D0FE73285C66F200B74B1A /* MetalLayerNorm.hpp */,
|
||||
|
@ -2110,14 +2111,12 @@
|
|||
4838EA802611C00B0027232C /* MetalGridSample.hpp */,
|
||||
4838EA822611C00B0027232C /* MetalGridSample.mm */,
|
||||
489D7A172550FDC800AD896A /* MetalReduction.hpp */,
|
||||
489D7A192550FDC800AD896A /* MetalConvolutionGEMM.hpp */,
|
||||
489D7A1D2550FDC800AD896A /* MetalROIPooling.hpp */,
|
||||
489D7A1E2550FDC800AD896A /* MetalCast.mm */,
|
||||
489D7A1F2550FDC800AD896A /* MetalRaster.hpp */,
|
||||
489D7A212550FDC800AD896A /* MetalReLU6.hpp */,
|
||||
489D7A222550FDC800AD896A /* MetalBackend.hpp */,
|
||||
489D7A252550FDC800AD896A /* MetalReduction.mm */,
|
||||
489D7A262550FDC800AD896A /* MetalConvolutionGEMM.mm */,
|
||||
489D7A272550FDC800AD896A /* MetalEltwise.mm */,
|
||||
489D7A282550FDC800AD896A /* MetalConvolution1x1.mm */,
|
||||
489D7A2A2550FDC800AD896A /* MetalUnary.hpp */,
|
||||
|
@ -2153,7 +2152,6 @@
|
|||
489D7A562550FDC800AD896A /* MetalScale.hpp */,
|
||||
489D7A572550FDC800AD896A /* MetalCast.hpp */,
|
||||
489D7A5E2550FDC800AD896A /* MetalConvolutionWinograd.mm */,
|
||||
489D7A5F2550FDC800AD896A /* MetalDefine.h */,
|
||||
489D7A622550FDC800AD896A /* MetalPReLU.mm */,
|
||||
489D7A632550FDC800AD896A /* MetalBinary.hpp */,
|
||||
489D7A652550FDC800AD896A /* MetalReLU6.mm */,
|
||||
|
@ -2293,7 +2291,6 @@
|
|||
4D9A933526255BDA00F9B43C /* backend */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
4D0C80E42862FC4700C7CAD6 /* CoreMLRaster.metal */,
|
||||
4D0C80E22862FC4100C7CAD6 /* CoreMLOPRegister.cpp */,
|
||||
4D4DAE67263905390060D37E /* CoreMLDefine.h */,
|
||||
4DDE2018263809920085AC8F /* CoreMLExecutorWrapper.h */,
|
||||
|
@ -2891,6 +2888,7 @@
|
|||
C4F906B327688C3A0026B847 /* NMSModule.hpp in Headers */,
|
||||
1F501F882397BA5B004E8721 /* Tensor.hpp in Headers */,
|
||||
1F501F872397BA5B004E8721 /* Matrix.h in Headers */,
|
||||
CE8049AC2B31C65B009B422C /* CPULayerNorm.hpp in Headers */,
|
||||
CECF8C5A299CACFD00D3875B /* WorkerThread.hpp in Headers */,
|
||||
48C84B85250F711700EE7666 /* IfModule.hpp in Headers */,
|
||||
4D9A937326255BDA00F9B43C /* CoreMLUnary.hpp in Headers */,
|
||||
|
@ -2913,7 +2911,6 @@
|
|||
92FF029623AA0B5A00AC97F6 /* CPUCast.hpp in Headers */,
|
||||
4D9A937826255BDA00F9B43C /* CoreMLBinary.hpp in Headers */,
|
||||
CECF8C85299CAD9400D3875B /* log_util.h in Headers */,
|
||||
489D7AB02550FDC900AD896A /* MetalDefine.h in Headers */,
|
||||
4D6D7FD52656896600F80814 /* DenseConvolutionTiledExecutor.hpp in Headers */,
|
||||
4D9A936626255BDA00F9B43C /* CoreMLExecutor.h in Headers */,
|
||||
92FF027A23AA0B5A00AC97F6 /* CPUPool.hpp in Headers */,
|
||||
|
@ -2972,6 +2969,7 @@
|
|||
4D9A937226255BDA00F9B43C /* CoreMLConvolution.hpp in Headers */,
|
||||
92FF038B23AA0B5A00AC97F6 /* CPUUnravelIndex.hpp in Headers */,
|
||||
4AF4FB26269ED235005BA97B /* SparseConvInt8TiledExecutor.hpp in Headers */,
|
||||
CEA49AA92AFD010900971CB7 /* MetalExecution.hpp in Headers */,
|
||||
92FF03BC23AA0B5A00AC97F6 /* OptimizedComputer.hpp in Headers */,
|
||||
48C84BA0250F725600EE7666 /* InitNet.hpp in Headers */,
|
||||
92FF03C623AA0B5A00AC97F6 /* CPUNonMaxSuppressionV2.hpp in Headers */,
|
||||
|
@ -3091,7 +3089,6 @@
|
|||
481C2DF125FE2CD6001ED6DF /* Arm82OptFunc.hpp in Headers */,
|
||||
4A5BEC6026AAB3B30032F6BD /* CommonCompute.hpp in Headers */,
|
||||
C43C8225251894F400A0FF84 /* WingoradGenerater.hpp in Headers */,
|
||||
489D7A6A2550FDC800AD896A /* MetalConvolutionGEMM.hpp in Headers */,
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
|
@ -3404,7 +3401,6 @@
|
|||
92FF041E23AA0B7100AC97F6 /* ShapeRange.cpp in Sources */,
|
||||
489D7AA42550FDC900AD896A /* MetalROIPooling.mm in Sources */,
|
||||
92FF03B423AA0B5A00AC97F6 /* Convolution1x1Strassen.cpp in Sources */,
|
||||
489D7A772550FDC800AD896A /* MetalConvolutionGEMM.mm in Sources */,
|
||||
92FF031623AA0B5A00AC97F6 /* MNNMatrixMax.S in Sources */,
|
||||
92FF043A23AA0B7100AC97F6 /* ShapePermute.cpp in Sources */,
|
||||
489D7A8E2550FDC900AD896A /* MetalPooling.mm in Sources */,
|
||||
|
@ -3502,6 +3498,7 @@
|
|||
489D7A9A2550FDC900AD896A /* MetalConvolutionCommon.mm in Sources */,
|
||||
92FF044623AA0B7100AC97F6 /* ShapeInnerProduct.cpp in Sources */,
|
||||
48123007269EA84800EB7ABA /* CPUUnique.cpp in Sources */,
|
||||
CEA49AA82AFD010900971CB7 /* MetalExecution.mm in Sources */,
|
||||
92FF036F23AA0B5A00AC97F6 /* CPURuntime.cpp in Sources */,
|
||||
92FF039D23AA0B5A00AC97F6 /* StrassenMatmulComputor.cpp in Sources */,
|
||||
92FF030B23AA0B5A00AC97F6 /* MNNUnPackC4.S in Sources */,
|
||||
|
@ -3594,7 +3591,6 @@
|
|||
950B28E229F627E00002F454 /* MNNBinarySubInt8.S in Sources */,
|
||||
950B28F029F627F70002F454 /* MNNBinarySubInt8.S in Sources */,
|
||||
4A224A0C27D0C2D9000A9260 /* ConvolutionPackWinograd.cpp in Sources */,
|
||||
4D0C80E52862FC4700C7CAD6 /* CoreMLRaster.metal in Sources */,
|
||||
92FF044123AA0B7100AC97F6 /* ShapeMoments.cpp in Sources */,
|
||||
950B28FA2A0C9AC20002F454 /* CPUScaleInt8.cpp in Sources */,
|
||||
4D9A936026255BDA00F9B43C /* Model.pb-c.c in Sources */,
|
||||
|
@ -4164,7 +4160,7 @@
|
|||
IPHONEOS_DEPLOYMENT_TARGET = 9.0;
|
||||
LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
|
||||
OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)";
|
||||
PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.playground.v3;
|
||||
PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.9999ve;
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
TARGETED_DEVICE_FAMILY = "1,2";
|
||||
};
|
||||
|
@ -4189,7 +4185,7 @@
|
|||
IPHONEOS_DEPLOYMENT_TARGET = 9.0;
|
||||
LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
|
||||
OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)";
|
||||
PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.playground.v3;
|
||||
PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.9999ve;
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
TARGETED_DEVICE_FAMILY = "1,2";
|
||||
};
|
||||
|
@ -4205,7 +4201,7 @@
|
|||
CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
|
||||
CODE_SIGN_STYLE = Automatic;
|
||||
CURRENT_PROJECT_VERSION = 1;
|
||||
DEVELOPMENT_TEAM = Q48UX93J22;
|
||||
DEVELOPMENT_TEAM = 6G7464HHUS;
|
||||
GENERATE_INFOPLIST_FILE = YES;
|
||||
INFOPLIST_FILE = demo/Info.plist;
|
||||
INFOPLIST_KEY_NSCameraUsageDescription = "use camera to capture photo for demo";
|
||||
|
@ -4221,7 +4217,7 @@
|
|||
MARKETING_VERSION = 1.0;
|
||||
MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
|
||||
MTL_FAST_MATH = YES;
|
||||
PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.playground.abcd111;
|
||||
PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.9999;
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
SWIFT_EMIT_LOC_STRINGS = YES;
|
||||
TARGETED_DEVICE_FAMILY = "1,2";
|
||||
|
@ -4238,7 +4234,7 @@
|
|||
CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
|
||||
CODE_SIGN_STYLE = Automatic;
|
||||
CURRENT_PROJECT_VERSION = 1;
|
||||
DEVELOPMENT_TEAM = Q48UX93J22;
|
||||
DEVELOPMENT_TEAM = 6G7464HHUS;
|
||||
GENERATE_INFOPLIST_FILE = YES;
|
||||
INFOPLIST_FILE = demo/Info.plist;
|
||||
INFOPLIST_KEY_NSCameraUsageDescription = "use camera to capture photo for demo";
|
||||
|
@ -4253,7 +4249,7 @@
|
|||
LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
|
||||
MARKETING_VERSION = 1.0;
|
||||
MTL_FAST_MATH = YES;
|
||||
PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.playground.abcd111;
|
||||
PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.9999;
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
SWIFT_EMIT_LOC_STRINGS = YES;
|
||||
TARGETED_DEVICE_FAMILY = "1,2";
|
||||
|
|
|
@ -255,7 +255,7 @@ struct GpuCache {
    [enc setTexture:inputTexture atIndex:0];
    [enc setBuffer:_cache->_constant offset:0 atIndex:1];
    MNNMetalTensorContent sharedContent;
    MNNMetalGetTensorContent(&sharedContent, _input);
    _input->getDeviceInfo(&sharedContent, MNN_FORWARD_METAL);
    // The Metal context only needs to write here, so a flush is enough; no finish() required.
    _input->wait(MNN::Tensor::MAP_TENSOR_WRITE, false);
    [enc setBuffer:sharedContent.buffer offset:sharedContent.offset atIndex:0];
|
||||
|
|
|
@ -78,7 +78,7 @@ def build_deps():
|
|||
if IS_WINDOWS:
|
||||
os.system('cmake -G "Ninja" ' + extra_opts +' -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TORCH=OFF\
|
||||
-DMNN_BUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON\
|
||||
-DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF .. && ninja MNN MNNTrain MNNConvertDeps')
|
||||
-DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF .. && ninja MNN MNNConvertDeps')
|
||||
elif IS_LINUX:
|
||||
extra_opts += '-DMNN_TENSORRT=ON \
|
||||
-DCMAKE_LIBRARY_PATH=/usr/local/cuda/lib64/stubs/ ' if USE_TRT else ' '
|
||||
|
@ -98,9 +98,9 @@ def build_deps():
|
|||
extra_opts += ' -DMNN_BUILD_TORCH=ON ' if USE_TORCH else ' '
|
||||
print(extra_opts)
|
||||
os.system('cmake ' + extra_opts + '-DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release \
|
||||
-DMNN_BUILD_SHARED_LIBS=OFF -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF\
|
||||
-DMNN_BUILD_SHARED_LIBS=ON -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF\
|
||||
-DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON \
|
||||
.. && make MNN MNNTrain MNNConvertDeps -j32')
|
||||
.. && make MNN MNNConvertDeps -j4')
|
||||
################################################################################
|
||||
# Building dependent libraries
|
||||
################################################################################
|
||||
|
|
|
@ -224,6 +224,9 @@ def configure_extension_build():
|
|||
|
||||
if USE_TRT:
|
||||
engine_depend += trt_depend
|
||||
if IS_DARWIN:
|
||||
lib_files += [('lib', [os.path.join(root_dir, BUILD_DIR, "libMNN.dylib")])]
|
||||
lib_files += [('lib', [os.path.join(root_dir, BUILD_DIR, "tools","converter", "libMNNConvertDeps.dylib")])]
|
||||
|
||||
if USE_CUDA:
|
||||
engine_depend += cuda_depend
|
||||
|
@ -307,9 +310,7 @@ def configure_extension_build():
|
|||
|
||||
if IS_DARWIN:
|
||||
engine_link_args += ['-stdlib=libc++']
|
||||
engine_link_args += ['-Wl,-all_load']
|
||||
engine_link_args += engine_depend
|
||||
engine_link_args += ['-Wl,-noall_load']
|
||||
if IS_LINUX:
|
||||
engine_link_args += ['-Wl,--whole-archive']
|
||||
engine_link_args += engine_depend
|
||||
|
@ -318,9 +319,7 @@ def configure_extension_build():
|
|||
if IS_WINDOWS:
|
||||
engine_link_args += ['/WHOLEARCHIVE:MNN.lib']
|
||||
if IS_DARWIN:
|
||||
tools_link_args += ['-Wl,-all_load']
|
||||
tools_link_args += tools_depend
|
||||
tools_link_args += ['-Wl,-noall_load']
|
||||
if IS_LINUX:
|
||||
tools_link_args += ['-Wl,--whole-archive']
|
||||
tools_link_args += tools_depend
|
||||
|
|
|
@ -1499,6 +1499,13 @@ static PyObject* PyMNNExpr_transpose(PyObject *self, PyObject *args) {
|
|||
}
|
||||
PyMNN_ERROR("transpose require args: (Var, [int]|Var)");
|
||||
}
|
||||
static PyObject* PyMNNExpr_reverse(PyObject *self, PyObject *args) {
|
||||
PyObject *x, *y;
|
||||
if (PyArg_ParseTuple(args, "OO", &x, &y) && isVar(x) && isVar(y)) {
|
||||
return toPyObj(Express::_Reverse(toVar(x), toVar(y)));
|
||||
}
|
||||
PyMNN_ERROR("reverse require args: (Var, Var)");
|
||||
}
|
||||
static PyObject* PyMNNExpr_reverse_sequence(PyObject *self, PyObject *args) {
|
||||
PyObject *x, *y;
|
||||
int batchDim, seqDim;
|
||||
|
@ -1839,6 +1846,7 @@ static PyMethodDef PyMNNExpr_methods[] = {
|
|||
{"transpose", PyMNNExpr_transpose, METH_VARARGS, "build transpose: (Var, [int]/Var)"},
|
||||
register_methods(Expr,
|
||||
channel_shuffle, "build channel_shuffle expr",
|
||||
reverse, "build reverse expr",
|
||||
reverse_sequence, "build reverse_sequence expr",
|
||||
crop, "build crop expr",
|
||||
resize, "build resize expr",
|
||||
|
|
|
@ -76,12 +76,6 @@ struct BatchNormT;
|
|||
struct Scale;
|
||||
struct ScaleT;
|
||||
|
||||
struct QuantizeLinear;
|
||||
struct QuantizeLinearT;
|
||||
|
||||
struct DequantizeLinear;
|
||||
struct DequantizeLinearT;
|
||||
|
||||
struct Eltwise;
|
||||
struct EltwiseT;
|
||||
|
||||
|
@ -165,10 +159,6 @@ inline const flatbuffers::TypeTable *BatchNormTypeTable();
|
|||
|
||||
inline const flatbuffers::TypeTable *ScaleTypeTable();
|
||||
|
||||
inline const flatbuffers::TypeTable *QuantizeLinearTypeTable();
|
||||
|
||||
inline const flatbuffers::TypeTable *DequantizeLinearTypeTable();
|
||||
|
||||
inline const flatbuffers::TypeTable *EltwiseTypeTable();
|
||||
|
||||
inline const flatbuffers::TypeTable *FlattenTypeTable();
|
||||
|
@ -1149,13 +1139,15 @@ struct QuantizedFloatParamT : public flatbuffers::NativeTable {
|
|||
int8_t clampMin;
|
||||
int8_t clampMax;
|
||||
std::vector<int32_t> winogradAttr;
|
||||
DataType outputDataType;
|
||||
QuantizedFloatParamT()
|
||||
: method(QuantizeAlgo_DEFAULT),
|
||||
nbits(8),
|
||||
zeroPoint(0),
|
||||
outputZeroPoint(0),
|
||||
clampMin(-128),
|
||||
clampMax(127) {
|
||||
clampMax(127),
|
||||
outputDataType(DataType_DT_INT8) {
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -1197,6 +1189,9 @@ struct QuantizedFloatParam FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table
|
|||
const flatbuffers::Vector<int32_t> *winogradAttr() const {
|
||||
return GetPointer<const flatbuffers::Vector<int32_t> *>(24);
|
||||
}
|
||||
DataType outputDataType() const {
|
||||
return static_cast<DataType>(GetField<int32_t>(26, 6));
|
||||
}
|
||||
bool Verify(flatbuffers::Verifier &verifier) const {
|
||||
return VerifyTableStart(verifier) &&
|
||||
VerifyOffset(verifier, 4) &&
|
||||
|
@ -1215,6 +1210,7 @@ struct QuantizedFloatParam FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table
|
|||
VerifyField<int8_t>(verifier, 22) &&
|
||||
VerifyOffset(verifier, 24) &&
|
||||
verifier.VerifyVector(winogradAttr()) &&
|
||||
VerifyField<int32_t>(verifier, 26) &&
|
||||
verifier.EndTable();
|
||||
}
|
||||
QuantizedFloatParamT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
|
||||
|
@ -1258,6 +1254,9 @@ struct QuantizedFloatParamBuilder {
|
|||
void add_winogradAttr(flatbuffers::Offset<flatbuffers::Vector<int32_t>> winogradAttr) {
|
||||
fbb_.AddOffset(24, winogradAttr);
|
||||
}
|
||||
void add_outputDataType(DataType outputDataType) {
|
||||
fbb_.AddElement<int32_t>(26, static_cast<int32_t>(outputDataType), 6);
|
||||
}
|
||||
explicit QuantizedFloatParamBuilder(flatbuffers::FlatBufferBuilder &_fbb)
|
||||
: fbb_(_fbb) {
|
||||
start_ = fbb_.StartTable();
|
||||
|
@ -1282,8 +1281,10 @@ inline flatbuffers::Offset<QuantizedFloatParam> CreateQuantizedFloatParam(
|
|||
int8_t outputZeroPoint = 0,
|
||||
int8_t clampMin = -128,
|
||||
int8_t clampMax = 127,
|
||||
flatbuffers::Offset<flatbuffers::Vector<int32_t>> winogradAttr = 0) {
|
||||
flatbuffers::Offset<flatbuffers::Vector<int32_t>> winogradAttr = 0,
|
||||
DataType outputDataType = DataType_DT_INT8) {
|
||||
QuantizedFloatParamBuilder builder_(_fbb);
|
||||
builder_.add_outputDataType(outputDataType);
|
||||
builder_.add_winogradAttr(winogradAttr);
|
||||
builder_.add_nbits(nbits);
|
||||
builder_.add_tensorScale(tensorScale);
|
||||
|
@ -2922,180 +2923,6 @@ inline flatbuffers::Offset<Scale> CreateScale(
|
|||
|
||||
flatbuffers::Offset<Scale> CreateScale(flatbuffers::FlatBufferBuilder &_fbb, const ScaleT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
|
||||
|
||||
struct QuantizeLinearT : public flatbuffers::NativeTable {
|
||||
typedef QuantizeLinear TableType;
|
||||
int32_t scaleSize;
|
||||
int32_t scaleAxis;
|
||||
std::vector<float> scaleData;
|
||||
std::vector<int8_t> zeroPointData;
|
||||
QuantizeLinearT()
|
||||
: scaleSize(0),
|
||||
scaleAxis(0) {
|
||||
}
|
||||
};
|
||||
|
||||
struct QuantizeLinear FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
|
||||
typedef QuantizeLinearT NativeTableType;
|
||||
static const flatbuffers::TypeTable *MiniReflectTypeTable() {
|
||||
return QuantizeLinearTypeTable();
|
||||
}
|
||||
int32_t scaleSize() const {
|
||||
return GetField<int32_t>(4, 0);
|
||||
}
|
||||
int32_t scaleAxis() const {
|
||||
return GetField<int32_t>(6, 0);
|
||||
}
|
||||
const flatbuffers::Vector<float> *scaleData() const {
|
||||
return GetPointer<const flatbuffers::Vector<float> *>(8);
|
||||
}
|
||||
const flatbuffers::Vector<int8_t> *zeroPointData() const {
|
||||
return GetPointer<const flatbuffers::Vector<int8_t> *>(10);
|
||||
}
|
||||
bool Verify(flatbuffers::Verifier &verifier) const {
|
||||
return VerifyTableStart(verifier) &&
|
||||
VerifyField<int32_t>(verifier, 4) &&
|
||||
VerifyField<int32_t>(verifier, 6) &&
|
||||
VerifyOffset(verifier, 8) &&
|
||||
verifier.VerifyVector(scaleData()) &&
|
||||
VerifyOffset(verifier, 10) &&
|
||||
verifier.VerifyVector(zeroPointData()) &&
|
||||
verifier.EndTable();
|
||||
}
|
||||
QuantizeLinearT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
|
||||
void UnPackTo(QuantizeLinearT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
|
||||
static flatbuffers::Offset<QuantizeLinear> Pack(flatbuffers::FlatBufferBuilder &_fbb, const QuantizeLinearT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
|
||||
};
|
||||
|
||||
struct QuantizeLinearBuilder {
|
||||
flatbuffers::FlatBufferBuilder &fbb_;
|
||||
flatbuffers::uoffset_t start_;
|
||||
void add_scaleSize(int32_t scaleSize) {
|
||||
fbb_.AddElement<int32_t>(4, scaleSize, 0);
|
||||
}
|
||||
void add_scaleAxis(int32_t scaleAxis) {
|
||||
fbb_.AddElement<int32_t>(6, scaleAxis, 0);
|
||||
}
|
||||
void add_scaleData(flatbuffers::Offset<flatbuffers::Vector<float>> scaleData) {
|
||||
fbb_.AddOffset(8, scaleData);
|
||||
}
|
||||
void add_zeroPointData(flatbuffers::Offset<flatbuffers::Vector<int8_t>> zeroPointData) {
|
||||
fbb_.AddOffset(10, zeroPointData);
|
||||
}
|
||||
explicit QuantizeLinearBuilder(flatbuffers::FlatBufferBuilder &_fbb)
|
||||
: fbb_(_fbb) {
|
||||
start_ = fbb_.StartTable();
|
||||
}
|
||||
QuantizeLinearBuilder &operator=(const QuantizeLinearBuilder &);
|
||||
flatbuffers::Offset<QuantizeLinear> Finish() {
|
||||
const auto end = fbb_.EndTable(start_);
|
||||
auto o = flatbuffers::Offset<QuantizeLinear>(end);
|
||||
return o;
|
||||
}
|
||||
};
|
||||
|
||||
inline flatbuffers::Offset<QuantizeLinear> CreateQuantizeLinear(
|
||||
flatbuffers::FlatBufferBuilder &_fbb,
|
||||
int32_t scaleSize = 0,
|
||||
int32_t scaleAxis = 0,
|
||||
flatbuffers::Offset<flatbuffers::Vector<float>> scaleData = 0,
|
||||
flatbuffers::Offset<flatbuffers::Vector<int8_t>> zeroPointData = 0) {
|
||||
QuantizeLinearBuilder builder_(_fbb);
|
||||
builder_.add_zeroPointData(zeroPointData);
|
||||
builder_.add_scaleData(scaleData);
|
||||
builder_.add_scaleAxis(scaleAxis);
|
||||
builder_.add_scaleSize(scaleSize);
|
||||
return builder_.Finish();
|
||||
}
|
||||
|
||||
flatbuffers::Offset<QuantizeLinear> CreateQuantizeLinear(flatbuffers::FlatBufferBuilder &_fbb, const QuantizeLinearT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
|
||||
|
||||
struct DequantizeLinearT : public flatbuffers::NativeTable {
|
||||
typedef DequantizeLinear TableType;
|
||||
int32_t scaleSize;
|
||||
int32_t scaleAxis;
|
||||
std::vector<float> scaleData;
|
||||
std::vector<int8_t> zeroPointData;
|
||||
DequantizeLinearT()
|
||||
: scaleSize(0),
|
||||
scaleAxis(0) {
|
||||
}
|
||||
};
|
||||
|
||||
struct DequantizeLinear FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
|
||||
typedef DequantizeLinearT NativeTableType;
|
||||
static const flatbuffers::TypeTable *MiniReflectTypeTable() {
|
||||
return DequantizeLinearTypeTable();
|
||||
}
|
||||
int32_t scaleSize() const {
|
||||
return GetField<int32_t>(4, 0);
|
||||
}
|
||||
int32_t scaleAxis() const {
|
||||
return GetField<int32_t>(6, 0);
|
||||
}
|
||||
const flatbuffers::Vector<float> *scaleData() const {
|
||||
return GetPointer<const flatbuffers::Vector<float> *>(8);
|
||||
}
|
||||
const flatbuffers::Vector<int8_t> *zeroPointData() const {
|
||||
return GetPointer<const flatbuffers::Vector<int8_t> *>(10);
|
||||
}
|
||||
bool Verify(flatbuffers::Verifier &verifier) const {
|
||||
return VerifyTableStart(verifier) &&
|
||||
VerifyField<int32_t>(verifier, 4) &&
|
||||
VerifyField<int32_t>(verifier, 6) &&
|
||||
VerifyOffset(verifier, 8) &&
|
||||
verifier.VerifyVector(scaleData()) &&
|
||||
VerifyOffset(verifier, 10) &&
|
||||
verifier.VerifyVector(zeroPointData()) &&
|
||||
verifier.EndTable();
|
||||
}
|
||||
DequantizeLinearT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
|
||||
void UnPackTo(DequantizeLinearT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
|
||||
static flatbuffers::Offset<DequantizeLinear> Pack(flatbuffers::FlatBufferBuilder &_fbb, const DequantizeLinearT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
|
||||
};
|
||||
|
||||
struct DequantizeLinearBuilder {
|
||||
flatbuffers::FlatBufferBuilder &fbb_;
|
||||
flatbuffers::uoffset_t start_;
|
||||
void add_scaleSize(int32_t scaleSize) {
|
||||
fbb_.AddElement<int32_t>(4, scaleSize, 0);
|
||||
}
|
||||
void add_scaleAxis(int32_t scaleAxis) {
|
||||
fbb_.AddElement<int32_t>(6, scaleAxis, 0);
|
||||
}
|
||||
void add_scaleData(flatbuffers::Offset<flatbuffers::Vector<float>> scaleData) {
|
||||
fbb_.AddOffset(8, scaleData);
|
||||
}
|
||||
void add_zeroPointData(flatbuffers::Offset<flatbuffers::Vector<int8_t>> zeroPointData) {
|
||||
fbb_.AddOffset(10, zeroPointData);
|
||||
}
|
||||
explicit DequantizeLinearBuilder(flatbuffers::FlatBufferBuilder &_fbb)
|
||||
: fbb_(_fbb) {
|
||||
start_ = fbb_.StartTable();
|
||||
}
|
||||
DequantizeLinearBuilder &operator=(const DequantizeLinearBuilder &);
|
||||
flatbuffers::Offset<DequantizeLinear> Finish() {
|
||||
const auto end = fbb_.EndTable(start_);
|
||||
auto o = flatbuffers::Offset<DequantizeLinear>(end);
|
||||
return o;
|
||||
}
|
||||
};
|
||||
|
||||
inline flatbuffers::Offset<DequantizeLinear> CreateDequantizeLinear(
|
||||
flatbuffers::FlatBufferBuilder &_fbb,
|
||||
int32_t scaleSize = 0,
|
||||
int32_t scaleAxis = 0,
|
||||
flatbuffers::Offset<flatbuffers::Vector<float>> scaleData = 0,
|
||||
flatbuffers::Offset<flatbuffers::Vector<int8_t>> zeroPointData = 0) {
|
||||
DequantizeLinearBuilder builder_(_fbb);
|
||||
builder_.add_zeroPointData(zeroPointData);
|
||||
builder_.add_scaleData(scaleData);
|
||||
builder_.add_scaleAxis(scaleAxis);
|
||||
builder_.add_scaleSize(scaleSize);
|
||||
return builder_.Finish();
|
||||
}
|
||||
|
||||
flatbuffers::Offset<DequantizeLinear> CreateDequantizeLinear(flatbuffers::FlatBufferBuilder &_fbb, const DequantizeLinearT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
|
||||
|
||||
struct EltwiseT : public flatbuffers::NativeTable {
|
||||
typedef Eltwise TableType;
|
||||
EltwiseType type;
|
||||
|
@ -4672,6 +4499,7 @@ inline void QuantizedFloatParam::UnPackTo(QuantizedFloatParamT *_o, const flatbu
|
|||
{ auto _e = clampMin(); _o->clampMin = _e; };
|
||||
{ auto _e = clampMax(); _o->clampMax = _e; };
|
||||
{ auto _e = winogradAttr(); if (_e) { _o->winogradAttr.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->winogradAttr[_i] = _e->Get(_i); } } };
|
||||
{ auto _e = outputDataType(); _o->outputDataType = _e; };
|
||||
}
|
||||
|
||||
inline flatbuffers::Offset<QuantizedFloatParam> QuantizedFloatParam::Pack(flatbuffers::FlatBufferBuilder &_fbb, const QuantizedFloatParamT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
|
||||
|
@ -4693,6 +4521,7 @@ inline flatbuffers::Offset<QuantizedFloatParam> CreateQuantizedFloatParam(flatbu
|
|||
auto _clampMin = _o->clampMin;
|
||||
auto _clampMax = _o->clampMax;
|
||||
auto _winogradAttr = _o->winogradAttr.size() ? _fbb.CreateVector(_o->winogradAttr) : 0;
|
||||
auto _outputDataType = _o->outputDataType;
|
||||
return MNN::CreateQuantizedFloatParam(
|
||||
_fbb,
|
||||
_weight,
|
||||
|
@ -4705,7 +4534,8 @@ inline flatbuffers::Offset<QuantizedFloatParam> CreateQuantizedFloatParam(flatbu
|
|||
_outputZeroPoint,
|
||||
_clampMin,
|
||||
_clampMax,
|
||||
_winogradAttr);
|
||||
_winogradAttr,
|
||||
_outputDataType);
|
||||
}
|
||||
|
||||
inline Convolution2DT *Convolution2D::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
|
||||
|
@ -5342,76 +5172,6 @@ inline flatbuffers::Offset<Scale> CreateScale(flatbuffers::FlatBufferBuilder &_f
|
|||
_external);
|
||||
}
|
||||
|
||||
inline QuantizeLinearT *QuantizeLinear::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
|
||||
auto _o = new QuantizeLinearT();
|
||||
UnPackTo(_o, _resolver);
|
||||
return _o;
|
||||
}
|
||||
|
||||
inline void QuantizeLinear::UnPackTo(QuantizeLinearT *_o, const flatbuffers::resolver_function_t *_resolver) const {
|
||||
(void)_o;
|
||||
(void)_resolver;
|
||||
{ auto _e = scaleSize(); _o->scaleSize = _e; };
|
||||
{ auto _e = scaleAxis(); _o->scaleAxis = _e; };
|
||||
{ auto _e = scaleData(); if (_e) { _o->scaleData.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->scaleData[_i] = _e->Get(_i); } } };
|
||||
{ auto _e = zeroPointData(); if (_e) { _o->zeroPointData.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->zeroPointData[_i] = _e->Get(_i); } } };
|
||||
}
|
||||
|
||||
inline flatbuffers::Offset<QuantizeLinear> QuantizeLinear::Pack(flatbuffers::FlatBufferBuilder &_fbb, const QuantizeLinearT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
|
||||
return CreateQuantizeLinear(_fbb, _o, _rehasher);
|
||||
}
|
||||
|
||||
inline flatbuffers::Offset<QuantizeLinear> CreateQuantizeLinear(flatbuffers::FlatBufferBuilder &_fbb, const QuantizeLinearT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
|
||||
(void)_rehasher;
|
||||
(void)_o;
|
||||
struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const QuantizeLinearT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
|
||||
auto _scaleSize = _o->scaleSize;
|
||||
auto _scaleAxis = _o->scaleAxis;
|
||||
auto _scaleData = _o->scaleData.size() ? _fbb.CreateVector(_o->scaleData) : 0;
|
||||
auto _zeroPointData = _o->zeroPointData.size() ? _fbb.CreateVector(_o->zeroPointData) : 0;
|
||||
return MNN::CreateQuantizeLinear(
|
||||
_fbb,
|
||||
_scaleSize,
|
||||
_scaleAxis,
|
||||
_scaleData,
|
||||
_zeroPointData);
|
||||
}
|
||||
|
||||
inline DequantizeLinearT *DequantizeLinear::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
|
||||
auto _o = new DequantizeLinearT();
|
||||
UnPackTo(_o, _resolver);
|
||||
return _o;
|
||||
}
|
||||
|
||||
inline void DequantizeLinear::UnPackTo(DequantizeLinearT *_o, const flatbuffers::resolver_function_t *_resolver) const {
|
||||
(void)_o;
|
||||
(void)_resolver;
|
||||
{ auto _e = scaleSize(); _o->scaleSize = _e; };
|
||||
{ auto _e = scaleAxis(); _o->scaleAxis = _e; };
|
||||
{ auto _e = scaleData(); if (_e) { _o->scaleData.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->scaleData[_i] = _e->Get(_i); } } };
|
||||
{ auto _e = zeroPointData(); if (_e) { _o->zeroPointData.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->zeroPointData[_i] = _e->Get(_i); } } };
|
||||
}
|
||||
|
||||
inline flatbuffers::Offset<DequantizeLinear> DequantizeLinear::Pack(flatbuffers::FlatBufferBuilder &_fbb, const DequantizeLinearT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
|
||||
return CreateDequantizeLinear(_fbb, _o, _rehasher);
|
||||
}
|
||||
|
||||
inline flatbuffers::Offset<DequantizeLinear> CreateDequantizeLinear(flatbuffers::FlatBufferBuilder &_fbb, const DequantizeLinearT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
|
||||
(void)_rehasher;
|
||||
(void)_o;
|
||||
struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const DequantizeLinearT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
|
||||
auto _scaleSize = _o->scaleSize;
|
||||
auto _scaleAxis = _o->scaleAxis;
|
||||
auto _scaleData = _o->scaleData.size() ? _fbb.CreateVector(_o->scaleData) : 0;
|
||||
auto _zeroPointData = _o->zeroPointData.size() ? _fbb.CreateVector(_o->zeroPointData) : 0;
|
||||
return MNN::CreateDequantizeLinear(
|
||||
_fbb,
|
||||
_scaleSize,
|
||||
_scaleAxis,
|
||||
_scaleData,
|
||||
_zeroPointData);
|
||||
}
|
||||
|
||||
inline EltwiseT *Eltwise::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
|
||||
auto _o = new EltwiseT();
|
||||
UnPackTo(_o, _resolver);
|
||||
|
@ -6243,10 +6003,12 @@ inline const flatbuffers::TypeTable *QuantizedFloatParamTypeTable() {
|
|||
{ flatbuffers::ET_CHAR, 0, -1 },
|
||||
{ flatbuffers::ET_CHAR, 0, -1 },
|
||||
{ flatbuffers::ET_CHAR, 0, -1 },
|
||||
{ flatbuffers::ET_INT, 1, -1 }
|
||||
{ flatbuffers::ET_INT, 1, -1 },
|
||||
{ flatbuffers::ET_INT, 0, 1 }
|
||||
};
|
||||
static const flatbuffers::TypeFunction type_refs[] = {
|
||||
QuantizeAlgoTypeTable
|
||||
QuantizeAlgoTypeTable,
|
||||
DataTypeTypeTable
|
||||
};
|
||||
static const char * const names[] = {
|
||||
"weight",
|
||||
|
@ -6259,10 +6021,11 @@ inline const flatbuffers::TypeTable *QuantizedFloatParamTypeTable() {
|
|||
"outputZeroPoint",
|
||||
"clampMin",
|
||||
"clampMax",
|
||||
"winogradAttr"
|
||||
"winogradAttr",
|
||||
"outputDataType"
|
||||
};
|
||||
static const flatbuffers::TypeTable tt = {
|
||||
flatbuffers::ST_TABLE, 11, type_codes, type_refs, nullptr, names
|
||||
flatbuffers::ST_TABLE, 12, type_codes, type_refs, nullptr, names
|
||||
};
|
||||
return &tt;
|
||||
}
|
||||
|
@ -6648,44 +6411,6 @@ inline const flatbuffers::TypeTable *ScaleTypeTable() {
|
|||
return &tt;
|
||||
}
|
||||
|
||||
inline const flatbuffers::TypeTable *QuantizeLinearTypeTable() {
|
||||
static const flatbuffers::TypeCode type_codes[] = {
|
||||
{ flatbuffers::ET_INT, 0, -1 },
|
||||
{ flatbuffers::ET_INT, 0, -1 },
|
||||
{ flatbuffers::ET_FLOAT, 1, -1 },
|
||||
{ flatbuffers::ET_CHAR, 1, -1 }
|
||||
};
|
||||
static const char * const names[] = {
|
||||
"scaleSize",
|
||||
"scaleAxis",
|
||||
"scaleData",
|
||||
"zeroPointData"
|
||||
};
|
||||
static const flatbuffers::TypeTable tt = {
|
||||
flatbuffers::ST_TABLE, 4, type_codes, nullptr, nullptr, names
|
||||
};
|
||||
return &tt;
|
||||
}
|
||||
|
||||
inline const flatbuffers::TypeTable *DequantizeLinearTypeTable() {
|
||||
static const flatbuffers::TypeCode type_codes[] = {
|
||||
{ flatbuffers::ET_INT, 0, -1 },
|
||||
{ flatbuffers::ET_INT, 0, -1 },
|
||||
{ flatbuffers::ET_FLOAT, 1, -1 },
|
||||
{ flatbuffers::ET_CHAR, 1, -1 }
|
||||
};
|
||||
static const char * const names[] = {
|
||||
"scaleSize",
|
||||
"scaleAxis",
|
||||
"scaleData",
|
||||
"zeroPointData"
|
||||
};
|
||||
static const flatbuffers::TypeTable tt = {
|
||||
flatbuffers::ST_TABLE, 4, type_codes, nullptr, nullptr, names
|
||||
};
|
||||
return &tt;
|
||||
}
|
||||
|
||||
inline const flatbuffers::TypeTable *EltwiseTypeTable() {
|
||||
static const flatbuffers::TypeCode type_codes[] = {
|
||||
{ flatbuffers::ET_CHAR, 0, 0 },
|
||||
|
|
|
@ -236,8 +236,6 @@ enum OpType {
|
|||
OpType_GatherElements = 152,
|
||||
OpType_Svd = 153,
|
||||
OpType_Histogram = 154,
|
||||
OpType_QuantizeLinear = 155,
|
||||
OpType_DequantizeLinear = 156,
|
||||
OpType_Plugin = 256,
|
||||
OpType_Select = 257,
|
||||
OpType_ZerosLike = 258,
|
||||
|
@ -267,7 +265,7 @@ enum OpType {
|
|||
OpType_MAX = OpType_GridSample
|
||||
};
|
||||
|
||||
inline const OpType (&EnumValuesOpType())[177] {
|
||||
inline const OpType (&EnumValuesOpType())[175] {
|
||||
static const OpType values[] = {
|
||||
OpType_AbsVal,
|
||||
OpType_QuantizedAdd,
|
||||
|
@ -419,8 +417,6 @@ inline const OpType (&EnumValuesOpType())[177] {
|
|||
OpType_GatherElements,
|
||||
OpType_Svd,
|
||||
OpType_Histogram,
|
||||
OpType_QuantizeLinear,
|
||||
OpType_DequantizeLinear,
|
||||
OpType_Plugin,
|
||||
OpType_Select,
|
||||
OpType_ZerosLike,
|
||||
|
@ -607,8 +603,8 @@ inline const char * const *EnumNamesOpType() {
|
|||
"GatherElements",
|
||||
"Svd",
|
||||
"Histogram",
|
||||
"QuantizeLinear",
|
||||
"DequantizeLinear",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
|
@ -1164,13 +1160,11 @@ enum OpParameter {
|
|||
OpParameter_LoopParam = 92,
|
||||
OpParameter_ImageProcessParam = 93,
|
||||
OpParameter_CumSum = 94,
|
||||
OpParameter_QuantizeLinear = 95,
|
||||
OpParameter_DequantizeLinear = 96,
|
||||
OpParameter_MIN = OpParameter_NONE,
|
||||
OpParameter_MAX = OpParameter_DequantizeLinear
|
||||
OpParameter_MAX = OpParameter_CumSum
|
||||
};
|
||||
|
||||
inline const OpParameter (&EnumValuesOpParameter())[97] {
|
||||
inline const OpParameter (&EnumValuesOpParameter())[95] {
|
||||
static const OpParameter values[] = {
|
||||
OpParameter_NONE,
|
||||
OpParameter_QuantizedAdd,
|
||||
|
@ -1266,9 +1260,7 @@ inline const OpParameter (&EnumValuesOpParameter())[97] {
|
|||
OpParameter_GridSample,
|
||||
OpParameter_LoopParam,
|
||||
OpParameter_ImageProcessParam,
|
||||
OpParameter_CumSum,
|
||||
OpParameter_QuantizeLinear,
|
||||
OpParameter_DequantizeLinear
|
||||
OpParameter_CumSum
|
||||
};
|
||||
return values;
|
||||
}
|
||||
|
@ -1370,15 +1362,13 @@ inline const char * const *EnumNamesOpParameter() {
|
|||
"LoopParam",
|
||||
"ImageProcessParam",
|
||||
"CumSum",
|
||||
"QuantizeLinear",
|
||||
"DequantizeLinear",
|
||||
nullptr
|
||||
};
|
||||
return names;
|
||||
}
|
||||
|
||||
inline const char *EnumNameOpParameter(OpParameter e) {
|
||||
if (e < OpParameter_NONE || e > OpParameter_DequantizeLinear) return "";
|
||||
if (e < OpParameter_NONE || e > OpParameter_CumSum) return "";
|
||||
const size_t index = static_cast<int>(e);
|
||||
return EnumNamesOpParameter()[index];
|
||||
}
|
||||
|
@ -1763,14 +1753,6 @@ template<> struct OpParameterTraits<CumSum> {
|
|||
static const OpParameter enum_value = OpParameter_CumSum;
|
||||
};
|
||||
|
||||
template<> struct OpParameterTraits<QuantizeLinear> {
|
||||
static const OpParameter enum_value = OpParameter_QuantizeLinear;
|
||||
};
|
||||
|
||||
template<> struct OpParameterTraits<DequantizeLinear> {
|
||||
static const OpParameter enum_value = OpParameter_DequantizeLinear;
|
||||
};
|
||||
|
||||
struct OpParameterUnion {
|
||||
OpParameter type;
|
||||
void *value;
|
||||
|
@ -2554,22 +2536,6 @@ struct OpParameterUnion {
|
|||
return type == OpParameter_CumSum ?
|
||||
reinterpret_cast<const CumSumT *>(value) : nullptr;
|
||||
}
|
||||
QuantizeLinearT *AsQuantizeLinear() {
|
||||
return type == OpParameter_QuantizeLinear ?
|
||||
reinterpret_cast<QuantizeLinearT *>(value) : nullptr;
|
||||
}
|
||||
const QuantizeLinearT *AsQuantizeLinear() const {
|
||||
return type == OpParameter_QuantizeLinear ?
|
||||
reinterpret_cast<const QuantizeLinearT *>(value) : nullptr;
|
||||
}
|
||||
DequantizeLinearT *AsDequantizeLinear() {
|
||||
return type == OpParameter_DequantizeLinear ?
|
||||
reinterpret_cast<DequantizeLinearT *>(value) : nullptr;
|
||||
}
|
||||
const DequantizeLinearT *AsDequantizeLinear() const {
|
||||
return type == OpParameter_DequantizeLinear ?
|
||||
reinterpret_cast<const DequantizeLinearT *>(value) : nullptr;
|
||||
}
|
||||
};
|
||||
|
||||
bool VerifyOpParameter(flatbuffers::Verifier &verifier, const void *obj, OpParameter type);
|
||||
|
@ -3633,12 +3599,6 @@ struct Op FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
|
|||
const CumSum *main_as_CumSum() const {
|
||||
return main_type() == OpParameter_CumSum ? static_cast<const CumSum *>(main()) : nullptr;
|
||||
}
|
||||
const QuantizeLinear *main_as_QuantizeLinear() const {
|
||||
return main_type() == OpParameter_QuantizeLinear ? static_cast<const QuantizeLinear *>(main()) : nullptr;
|
||||
}
|
||||
const DequantizeLinear *main_as_DequantizeLinear() const {
|
||||
return main_type() == OpParameter_DequantizeLinear ? static_cast<const DequantizeLinear *>(main()) : nullptr;
|
||||
}
|
||||
const flatbuffers::String *name() const {
|
||||
return GetPointer<const flatbuffers::String *>(10);
|
||||
}
|
||||
|
@ -4047,14 +4007,6 @@ template<> inline const CumSum *Op::main_as<CumSum>() const {
|
|||
return main_as_CumSum();
|
||||
}
|
||||
|
||||
template<> inline const QuantizeLinear *Op::main_as<QuantizeLinear>() const {
|
||||
return main_as_QuantizeLinear();
|
||||
}
|
||||
|
||||
template<> inline const DequantizeLinear *Op::main_as<DequantizeLinear>() const {
|
||||
return main_as_DequantizeLinear();
|
||||
}
|
||||
|
||||
struct OpBuilder {
|
||||
flatbuffers::FlatBufferBuilder &fbb_;
|
||||
flatbuffers::uoffset_t start_;
|
||||
|
@ -5676,14 +5628,6 @@ inline bool VerifyOpParameter(flatbuffers::Verifier &verifier, const void *obj,
|
|||
auto ptr = reinterpret_cast<const CumSum *>(obj);
|
||||
return verifier.VerifyTable(ptr);
|
||||
}
|
||||
case OpParameter_QuantizeLinear: {
|
||||
auto ptr = reinterpret_cast<const QuantizeLinear *>(obj);
|
||||
return verifier.VerifyTable(ptr);
|
||||
}
|
||||
case OpParameter_DequantizeLinear: {
|
||||
auto ptr = reinterpret_cast<const DequantizeLinear *>(obj);
|
||||
return verifier.VerifyTable(ptr);
|
||||
}
|
||||
default: return false;
|
||||
}
|
||||
}
|
||||
|
@ -6078,14 +6022,6 @@ inline void *OpParameterUnion::UnPack(const void *obj, OpParameter type, const f
|
|||
auto ptr = reinterpret_cast<const CumSum *>(obj);
|
||||
return ptr->UnPack(resolver);
|
||||
}
|
||||
case OpParameter_QuantizeLinear: {
|
||||
auto ptr = reinterpret_cast<const QuantizeLinear *>(obj);
|
||||
return ptr->UnPack(resolver);
|
||||
}
|
||||
case OpParameter_DequantizeLinear: {
|
||||
auto ptr = reinterpret_cast<const DequantizeLinear *>(obj);
|
||||
return ptr->UnPack(resolver);
|
||||
}
|
||||
default: return nullptr;
|
||||
}
|
||||
}
|
||||
|
@ -6468,14 +6404,6 @@ inline flatbuffers::Offset<void> OpParameterUnion::Pack(flatbuffers::FlatBufferB
|
|||
auto ptr = reinterpret_cast<const CumSumT *>(value);
|
||||
return CreateCumSum(_fbb, ptr, _rehasher).Union();
|
||||
}
|
||||
case OpParameter_QuantizeLinear: {
|
||||
auto ptr = reinterpret_cast<const QuantizeLinearT *>(value);
|
||||
return CreateQuantizeLinear(_fbb, ptr, _rehasher).Union();
|
||||
}
|
||||
case OpParameter_DequantizeLinear: {
|
||||
auto ptr = reinterpret_cast<const DequantizeLinearT *>(value);
|
||||
return CreateDequantizeLinear(_fbb, ptr, _rehasher).Union();
|
||||
}
|
||||
default: return 0;
|
||||
}
|
||||
}
|
||||
|
@ -6858,14 +6786,6 @@ inline OpParameterUnion::OpParameterUnion(const OpParameterUnion &u) FLATBUFFERS
|
|||
value = new CumSumT(*reinterpret_cast<CumSumT *>(u.value));
|
||||
break;
|
||||
}
|
||||
case OpParameter_QuantizeLinear: {
|
||||
value = new QuantizeLinearT(*reinterpret_cast<QuantizeLinearT *>(u.value));
|
||||
break;
|
||||
}
|
||||
case OpParameter_DequantizeLinear: {
|
||||
value = new DequantizeLinearT(*reinterpret_cast<DequantizeLinearT *>(u.value));
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
@ -7343,16 +7263,6 @@ inline void OpParameterUnion::Reset() {
|
|||
delete ptr;
|
||||
break;
|
||||
}
|
||||
case OpParameter_QuantizeLinear: {
|
||||
auto ptr = reinterpret_cast<QuantizeLinearT *>(value);
|
||||
delete ptr;
|
||||
break;
|
||||
}
|
||||
case OpParameter_DequantizeLinear: {
|
||||
auto ptr = reinterpret_cast<DequantizeLinearT *>(value);
|
||||
delete ptr;
|
||||
break;
|
||||
}
|
||||
default: break;
|
||||
}
|
||||
value = nullptr;
|
||||
|
@ -7535,14 +7445,12 @@ inline const flatbuffers::TypeTable *OpTypeTypeTable() {
|
|||
{ flatbuffers::ET_INT, 0, 0 },
|
||||
{ flatbuffers::ET_INT, 0, 0 },
|
||||
{ flatbuffers::ET_INT, 0, 0 },
|
||||
{ flatbuffers::ET_INT, 0, 0 },
|
||||
{ flatbuffers::ET_INT, 0, 0 },
|
||||
{ flatbuffers::ET_INT, 0, 0 }
|
||||
};
|
||||
static const flatbuffers::TypeFunction type_refs[] = {
|
||||
OpTypeTypeTable
|
||||
};
|
||||
static const int64_t values[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 512, 513, 514, 515, 516, 517, 518, 600, 601, 603, 604 };
|
||||
static const int64_t values[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 512, 513, 514, 515, 516, 517, 518, 600, 601, 603, 604 };
|
||||
static const char * const names[] = {
|
||||
"AbsVal",
|
||||
"QuantizedAdd",
|
||||
|
@ -7694,8 +7602,6 @@ inline const flatbuffers::TypeTable *OpTypeTypeTable() {
|
|||
"GatherElements",
|
||||
"Svd",
|
||||
"Histogram",
|
||||
"QuantizeLinear",
|
||||
"DequantizeLinear",
|
||||
"Plugin",
|
||||
"Select",
|
||||
"ZerosLike",
|
||||
|
@ -7723,7 +7629,7 @@ inline const flatbuffers::TypeTable *OpTypeTypeTable() {
|
|||
"GridSample"
|
||||
};
|
||||
static const flatbuffers::TypeTable tt = {
|
||||
flatbuffers::ST_ENUM, 177, type_codes, type_refs, values, names
|
||||
flatbuffers::ST_ENUM, 175, type_codes, type_refs, values, names
|
||||
};
|
||||
return &tt;
|
||||
}
|
||||
|
@ -7824,9 +7730,7 @@ inline const flatbuffers::TypeTable *OpParameterTypeTable() {
|
|||
{ flatbuffers::ET_SEQUENCE, 0, 90 },
|
||||
{ flatbuffers::ET_SEQUENCE, 0, 91 },
|
||||
{ flatbuffers::ET_SEQUENCE, 0, 92 },
|
||||
{ flatbuffers::ET_SEQUENCE, 0, 93 },
|
||||
{ flatbuffers::ET_SEQUENCE, 0, 94 },
|
||||
{ flatbuffers::ET_SEQUENCE, 0, 95 }
|
||||
{ flatbuffers::ET_SEQUENCE, 0, 93 }
|
||||
};
|
||||
static const flatbuffers::TypeFunction type_refs[] = {
|
||||
QuantizedAddTypeTable,
|
||||
|
@ -7922,9 +7826,7 @@ inline const flatbuffers::TypeTable *OpParameterTypeTable() {
|
|||
GridSampleTypeTable,
|
||||
LoopParamTypeTable,
|
||||
ImageProcessParamTypeTable,
|
||||
CumSumTypeTable,
|
||||
QuantizeLinearTypeTable,
|
||||
DequantizeLinearTypeTable
|
||||
CumSumTypeTable
|
||||
};
|
||||
static const char * const names[] = {
|
||||
"NONE",
|
||||
|
@ -8021,12 +7923,10 @@ inline const flatbuffers::TypeTable *OpParameterTypeTable() {
|
|||
"GridSample",
|
||||
"LoopParam",
|
||||
"ImageProcessParam",
|
||||
"CumSum",
|
||||
"QuantizeLinear",
|
||||
"DequantizeLinear"
|
||||
"CumSum"
|
||||
};
|
||||
static const flatbuffers::TypeTable tt = {
|
||||
flatbuffers::ST_UNION, 97, type_codes, type_refs, nullptr, names
|
||||
flatbuffers::ST_UNION, 95, type_codes, type_refs, nullptr, names
|
||||
};
|
||||
return &tt;
|
||||
}
|
||||
|
|
|
@ -95,6 +95,7 @@ table QuantizedFloatParam{
|
|||
clampMax: byte = 127;
|
||||
// binary proto: [originKySize, originKxSize, transKySize, transKxSize, {kyStart, kxStart, unitY, unitX}, {...} ...]
|
||||
winogradAttr:[int];
|
||||
outputDataType:DataType=DT_INT8;
|
||||
}
|
||||
|
||||
table Convolution2D {
|
||||
|
@ -247,20 +248,6 @@ table Scale {
|
|||
external:[int64]; // [offset, scaleData_bytes_size, biasData_bytes_size]
|
||||
}
|
||||
|
||||
table QuantizeLinear {
|
||||
scaleSize: int;
|
||||
scaleAxis: int;
|
||||
scaleData:[float];
|
||||
zeroPointData:[byte];
|
||||
}
|
||||
|
||||
table DequantizeLinear {
|
||||
scaleSize: int;
|
||||
scaleAxis: int;
|
||||
scaleData:[float];
|
||||
zeroPointData:[byte];
|
||||
}
|
||||
|
||||
enum EltwiseType : byte {
|
||||
PROD = 0,
|
||||
SUM = 1,
|
||||
|
|
|
@ -167,8 +167,6 @@ enum OpType : int {
|
|||
GatherElements = 152,
|
||||
Svd = 153,
|
||||
Histogram = 154,
|
||||
QuantizeLinear = 155,
|
||||
DequantizeLinear = 156,
|
||||
|
||||
Plugin = 256, //The Type load from plugin
|
||||
//Training Op Start from 257
|
||||
|
@ -392,8 +390,6 @@ union OpParameter {
|
|||
LoopParam,
|
||||
ImageProcessParam,
|
||||
CumSum,
|
||||
QuantizeLinear,
|
||||
DequantizeLinear,
|
||||
}
|
||||
|
||||
table Op {
|
||||
|
|
|
@ -62,10 +62,12 @@ vadd.f16 q3, q3, q1
|
|||
vmul.f16 q2, q2, q14
|
||||
vmul.f16 q3, q3, q14
|
||||
|
||||
mov lr, #5.0
|
||||
mov lr, #5
|
||||
vdup.16 q4, lr
|
||||
mov lr, #-5.0
|
||||
vcvt.f32.s32 q4, q4
|
||||
mov lr, #-5
|
||||
vdup.16 q5, lr
|
||||
vcvt.f32.s32 q5, q5
|
||||
vmax.f16 q2, q2, q5
|
||||
vmin.f16 q2, q2, q4
|
||||
vmax.f16 q3, q3, q5
|
||||
|
|
|
@ -45,8 +45,8 @@ dup v10.8h, w9 // v10: [28.f]x4
|
|||
dup v9.8h, w10 // v9: [3150.f]x4
|
||||
dup v8.8h, w11 // v8: [62370.f]x4
|
||||
|
||||
mov w4, #5.0
|
||||
mov w5, #-5.0
|
||||
mov w4, #5
|
||||
mov w5, #-5
|
||||
|
||||
GeluZLoop:
|
||||
|
||||
|
@ -67,6 +67,8 @@ fmul v3.8h, v3.8h, v14.8h
|
|||
|
||||
dup v6.8h, w5
|
||||
dup v7.8h, w4
|
||||
scvtf v6.8h, v6.8h
|
||||
scvtf v7.8h, v7.8h
|
||||
fmin v2.8h, v2.8h, v7.8h
|
||||
fmin v3.8h, v3.8h, v7.8h
|
||||
fmax v2.8h, v2.8h, v6.8h
|
||||
|
|
|
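The two assembly hunks above replace mov lr, #5.0 and mov w4, #5.0: mov only accepts integer immediates, so the clamp bounds are now loaded as integers and converted to floating point in-register (vcvt / scvtf) before the min/max. A rough scalar equivalent of the clamp those kernels apply (sketch only, helper name illustrative):

// Clamp the GELU polynomial argument to [-5, 5], as the patched kernels do.
static inline float clampGeluArg(float v) {
    const float hi = 5.0f;
    const float lo = -5.0f;
    return v > hi ? hi : (v < lo ? lo : v);
}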
@ -195,6 +195,9 @@ Execution *CPUCastCreator::onCreate(const std::vector<Tensor *> &inputs, const s
|
|||
if (dstT == MNN::DataType_DT_INT8 && halide_type_of<float>() == inputDataType) {
|
||||
return new CastDataType<float, int8_t>(backend);
|
||||
}
|
||||
if (dstT == MNN::DataType_DT_INT8 && halide_type_of<int32_t>() == inputDataType) {
|
||||
return new CastDataType<int32_t, int8_t>(backend);
|
||||
}
|
||||
if (dstT == MNN::DataType_DT_UINT8 && halide_type_of<float>() == inputDataType) {
|
||||
return new CastDataType<float, uint8_t>(backend);
|
||||
}
|
||||
|
|
|
@ -1,87 +0,0 @@
|
|||
//
|
||||
// CPUDequantizeLinear.cpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2018/07/15.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
#include "backend/cpu/CPUBackend.hpp"
|
||||
#include "core/Concurrency.h"
|
||||
#include "backend/cpu/CPUDequantizeLinear.hpp"
|
||||
#include "core/TensorUtils.hpp"
|
||||
#include "compute/CommonOptFunction.h"
|
||||
|
||||
namespace MNN {
|
||||
|
||||
CPUDequantizeLinear::CPUDequantizeLinear(Backend *b, float* scale, int8_t* zeroPoints, int size, int axis, int inputBits) : MNN::Execution(b){
|
||||
mSize = size;
|
||||
mAxis = axis;
|
||||
mInputBits = inputBits;
|
||||
}
|
||||
ErrorCode CPUDequantizeLinear::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
if (mInputBits == 8) {
|
||||
mFunc = dequantizeFunc<int8_t>;
|
||||
} else if (mInputBits == 16) {
|
||||
mFunc = dequantizeFunc<int16_t>;
|
||||
} else {
|
||||
mFunc = dequantizeFunc<int32_t>;
|
||||
}
|
||||
float *scale = inputs[1]->host<float>();
|
||||
int8_t *zero = nullptr;
|
||||
if (inputs.size() > 2) {
|
||||
zero = inputs[2]->host<int8_t>();
|
||||
}
|
||||
if (mSize == 1) {
|
||||
mQuantScales.resize(4, *scale);
|
||||
if (nullptr != zero) {
|
||||
mQuantZeroPoints.resize(4, *zero);
|
||||
} else {
|
||||
mQuantZeroPoints.resize(4, 0);
|
||||
}
|
||||
} else {
|
||||
mQuantScales.resize(mSize);
|
||||
::memcpy(mQuantScales.data(), scale, sizeof(float) * mSize);
|
||||
if (nullptr != zero) {
|
||||
mQuantZeroPoints.resize(mSize);
|
||||
::memcpy(mQuantZeroPoints.data(), zero, mSize);
|
||||
} else {
|
||||
mQuantZeroPoints.resize(mSize);
|
||||
}
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
ErrorCode CPUDequantizeLinear::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto input = inputs[0];
|
||||
int N = input->length(0);
|
||||
ssize_t size = N;
|
||||
auto core = static_cast<CPUBackend*>(backend())->int8Functions();
|
||||
int UNIT, SRC_UNIT, DST_XUNIT;
|
||||
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
|
||||
auto dst = outputs[0]->host<float>();
|
||||
auto src = input->host<int8_t>();
|
||||
mFunc(dst, src, input->dimensions(), input->size(), mSize, UNIT, mQuantScales.data(), mQuantZeroPoints.data(), core);
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
class CPUDequantizeLinearCreator : public CPUBackend::Creator {
|
||||
public:
|
||||
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
|
||||
const MNN::Op *op, Backend *backend) const override {
|
||||
auto dataType = inputs[0]->getType();
|
||||
if (dataType.bits != 8 && dataType.bits != 16 && dataType.bits != 32) {
|
||||
MNN_ERROR("Input of Dequantize must be int8/uint8/fp16/int32\n");
|
||||
return nullptr;
|
||||
}
|
||||
int inputBits = dataType.bits;
|
||||
int size = op->main_as_DequantizeLinear()->scaleSize();
|
||||
int axis = op->main_as_DequantizeLinear()->scaleAxis();
|
||||
if (inputs.size() > 2) {
|
||||
return new CPUDequantizeLinear(backend, inputs[1]->host<float>(), inputs[2]->host<int8_t>(), size, axis, inputBits);
|
||||
}
|
||||
return new CPUDequantizeLinear(backend, inputs[1]->host<float>(), nullptr, size, axis, inputBits);
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_CPU_OP_CREATOR(CPUDequantizeLinearCreator, OpType_DequantizeLinear);
|
||||
|
||||
} // namespace MNN
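For context, the kernel removed above boils down to per-element linear dequantization; on SSE builds it reads the data as uint8 and subtracts an extra 128, otherwise it works on int8 directly. A scalar sketch of the per-element step (not the shipped implementation, helper name illustrative):

#include <cstdint>

static inline float dequantizeOne(int8_t q, int8_t zeroPoint, float scale) {
    // SSE builds additionally subtract 128 after reinterpreting the data as uint8; omitted here.
    return static_cast<float>(q - zeroPoint) * scale;
}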
|
|
@ -1,81 +0,0 @@
|
|||
//
|
||||
// CPUDequantizeLinear.hpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2018/07/15.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifndef CPUDequantizeLinear_hpp
|
||||
#define CPUDequantizeLinear_hpp
|
||||
|
||||
#include "core/AutoStorage.h"
|
||||
#include "core/Execution.hpp"
|
||||
#include "compute/Int8FunctionsOpt.h"
|
||||
|
||||
namespace MNN {
|
||||
typedef void(*dequantFunc)(float* dst, const int8_t* source, int inputDim, int inputSize, int size, int UNIT, float* scales, int8_t* zeros, const CoreInt8Functions* core);
|
||||
class CPUDequantizeLinear : public Execution {
|
||||
public:
|
||||
CPUDequantizeLinear(Backend *b, float* scales, int8_t* zeroPoints, int size = 1, int axis = 0, int inputBits = 8);
|
||||
virtual ~CPUDequantizeLinear() = default;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
private:
|
||||
std::vector<float> mQuantScales;
|
||||
std::vector<int8_t> mQuantZeroPoints;
|
||||
int mSize = 1;
|
||||
int mAxis = 0;
|
||||
int mInputBits = 8;
|
||||
dequantFunc mFunc;
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
void dequantizeFunc(float* dst, const int8_t* source, int inputDim, int inputSize, int size, int UNIT, float* scales, int8_t* zeros, const CoreInt8Functions* core) {
|
||||
#ifdef MNN_USE_SSE
|
||||
auto src = (uint8_t*)source;
|
||||
int offset = 128;
|
||||
#else
|
||||
auto src = (int8_t*)source;
|
||||
int offset = 0;
|
||||
#endif
|
||||
// auto src = (T*)source;
|
||||
if (inputDim == 1) {
|
||||
for (int i = 0; i < size; ++i) {
|
||||
dst[i] = static_cast<float>(src[i] - zeros[i] - offset) * scales[i];
|
||||
}
|
||||
return;
|
||||
}
|
||||
int chw = 1;
|
||||
if (inputDim > 1) {
|
||||
chw = inputSize / (size * sizeof(T));
|
||||
}
|
||||
|
||||
if (size == 1) {
|
||||
if (sizeof(T) == 1) {
|
||||
core->MNNInt8ScaleToFloat(dst, (int8_t*)src, scales, chw / UNIT, zeros[0]);
|
||||
int sizeDiv = (int)chw / UNIT;
|
||||
for (int k = sizeDiv * UNIT; k < chw; ++k) {
|
||||
dst[k] = static_cast<float>(src[k] - zeros[0] - offset) * scales[0];
|
||||
}
|
||||
} else {
|
||||
for (int k = 0; k < chw; ++k) {
|
||||
dst[k] = static_cast<float>(src[k] - zeros[0] - offset) * scales[0];
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
for (int i = 0; i < size; ++i) {
|
||||
std::vector<float> tmp(4, scales[i]);
|
||||
//core->MNNInt8ScaleToFloat(dst, src, tmp.data(), sizeDiv, mQuantZeroPoints[i]);
|
||||
for (int k = 0; k < chw; ++k) {
|
||||
dst[k] = static_cast<float>(src[k] - zeros[i] - offset) * scales[i];
|
||||
}
|
||||
src += chw;
|
||||
dst += chw;
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace MNN
|
||||
|
||||
#endif /* CPUDequantizeLinear_hpp */
|
|
@ -66,8 +66,6 @@ extern void ___CPUSetDiff1DCreator__OpType_SetDiff1D__();
|
|||
extern void ___CPUEltwiseInt8Creator__OpType_EltwiseInt8__();
|
||||
extern void ___CPUSvdCreator__OpType_Svd__();
|
||||
extern void ___CPULayerNormCreator__OpType_LayerNorm__();
|
||||
extern void ___CPUQuantizeLinearCreator__OpType_QuantizeLinear__();
|
||||
extern void ___CPUDequantizeLinearCreator__OpType_DequantizeLinear__();
|
||||
|
||||
#ifdef MNN_SUPPORT_RENDER
|
||||
extern void ___CPURasterAndInterpolateCreator__OpType_RasterAndInterpolate__();
|
||||
|
@ -146,8 +144,5 @@ ___CPURasterAndInterpolateCreator__OpType_RasterAndInterpolate__();
|
|||
___CPURasterDiffCreator__OpType_RasterDiff__();
|
||||
___CPUTextureCreator__OpType_Texture__();
|
||||
#endif
|
||||
___CPUQuantizeLinearCreator__OpType_QuantizeLinear__();
|
||||
___CPUDequantizeLinearCreator__OpType_DequantizeLinear__();
|
||||
//CPUQuantizeLinearCreator
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,85 +0,0 @@
|
|||
//
|
||||
// CPUQuantizeLinear.cpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2018/07/15.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
#include "backend/cpu/CPUBackend.hpp"
|
||||
#include "core/Concurrency.h"
|
||||
#include "backend/cpu/CPUQuantizeLinear.hpp"
|
||||
#include "compute/CommonOptFunction.h"
|
||||
#include "core/TensorUtils.hpp"
|
||||
|
||||
namespace MNN {
|
||||
|
||||
CPUQuantizeLinear::CPUQuantizeLinear(Backend *b, int size, int axis) : MNN::Execution(b){
|
||||
mSize = size;
|
||||
mAxis = axis;
|
||||
}
|
||||
|
||||
ErrorCode CPUQuantizeLinear::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
int size = mSize;
|
||||
float* scale = inputs[1]->host<float>();
|
||||
int8_t* zero = nullptr;
|
||||
if (inputs.size() > 2) {
|
||||
zero = inputs[2]->host<int8_t>();
|
||||
}
|
||||
if (mSize == 1) {
|
||||
float s = scale[0] == 0 ? 0 : 1 / scale[0];
|
||||
mQuantScales.resize(4, s);
|
||||
if (nullptr != zero) {
|
||||
int8_t z = *zero;
|
||||
mQuantZeroPoints.resize(4, z);
|
||||
} else {
|
||||
mQuantZeroPoints.resize(4);
|
||||
}
|
||||
} else { // TODO scale: (1,D)
|
||||
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
ErrorCode CPUQuantizeLinear::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto input = inputs[0];
|
||||
int N = input->length(0), C = input->length(1), H = input->length(2), W = input->length(3);
|
||||
ssize_t size = N * C * H * W;
|
||||
auto core = static_cast<CPUBackend*>(backend())->int8Functions();
|
||||
int UNIT, SRC_UNIT, DST_XUNIT;
|
||||
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
|
||||
int maxValue = 127;
|
||||
int minValue = -128;
|
||||
#ifdef MNN_USE_SSE
|
||||
auto dst = outputs[0]->host<uint8_t>();
|
||||
int offset = 128;
|
||||
#else
|
||||
auto dst = outputs[0]->host<int8_t>();
|
||||
int offset = 0;
|
||||
#endif
|
||||
if (mSize == 1) {
|
||||
auto src = input->host<float>();
|
||||
int sizeDiv = (int)size / UNIT;
|
||||
core->MNNFloat2Int8(src, (int8_t*)dst, size / UNIT, mQuantScales.data(), -128, 127, mQuantZeroPoints[0]);
|
||||
for (int i = sizeDiv * UNIT; i < size; ++i) {
|
||||
int v = (int)roundf(src[i] * mQuantScales[0]) + mQuantZeroPoints[0] + offset;
|
||||
v = std::max(minValue + offset, std::min(maxValue + offset, v));
|
||||
dst[i] = v;
|
||||
}
|
||||
} else {
|
||||
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
class CPUQuantizeLinearCreator : public CPUBackend::Creator {
|
||||
public:
|
||||
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
|
||||
const MNN::Op *op, Backend *backend) const override {
|
||||
int size = op->main_as_QuantizeLinear()->scaleSize();
|
||||
int axis = op->main_as_QuantizeLinear()->scaleAxis();
|
||||
return new CPUQuantizeLinear(backend, size, axis);
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_CPU_OP_CREATOR(CPUQuantizeLinearCreator, OpType_QuantizeLinear);
|
||||
|
||||
} // namespace MNN
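The removed quantize kernel is the inverse step: multiply by the reciprocal scale, add the zero point, round, and clamp to [-128, 127] (plus the same +128 shift on SSE builds). A scalar sketch (helper name illustrative):

#include <algorithm>
#include <cmath>
#include <cstdint>

static inline int8_t quantizeOne(float x, float invScale /* 1 / scale */, int8_t zeroPoint) {
    int v = static_cast<int>(std::roundf(x * invScale)) + zeroPoint;
    return static_cast<int8_t>(std::max(-128, std::min(127, v)));
}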
|
|
@ -1,31 +0,0 @@
|
|||
//
|
||||
// CPUQuantizeLinear.hpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2018/07/15.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifndef CPUQuantizeLinear_hpp
|
||||
#define CPUQuantizeLinear_hpp
|
||||
|
||||
#include "core/AutoStorage.h"
|
||||
#include "core/Execution.hpp"
|
||||
|
||||
namespace MNN {
|
||||
class CPUQuantizeLinear : public Execution {
|
||||
public:
|
||||
CPUQuantizeLinear(Backend *b, int size = 1, int axis = 0);
|
||||
virtual ~CPUQuantizeLinear() = default;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
private:
|
||||
std::vector<float> mQuantScales;
|
||||
std::vector<int8_t> mQuantZeroPoints;
|
||||
int mSize = 1;
|
||||
int mAxis = 0;
|
||||
};
|
||||
|
||||
} // namespace MNN
|
||||
|
||||
#endif /* CPUQuantizeLinear_hpp */
|
|
@ -884,14 +884,14 @@ public:
|
|||
// Loop Op's command's first index must be output
|
||||
outputStride = cmd->view()->GetAs<View>(0)->stride()->data();
|
||||
}
|
||||
halide_type_t outputType;
|
||||
halide_type_t inputType;
|
||||
for (int v=0; v<iterIndexsize; ++v) {
|
||||
auto tensorIndex = cmd->indexes()->data()[v];
|
||||
auto tensor = mStack[tensorIndex];
|
||||
auto iterIndex = cmd->iterIndexes()->data()[v];
|
||||
auto offset = iter;
|
||||
if (0 == v) {
outputType = tensor->getType();
}
if (1 == v) {
inputType = tensor->getType();
}
|
||||
if (iterIndex >= 0) {
|
||||
offset = mStack[iterIndex]->host<int32_t>()[iter];
|
||||
|
@ -969,10 +969,10 @@ public:
|
|||
if (OpType_BinaryOp == op->type()) {
|
||||
auto src0 = mContainer[tId].stackPtr[cmd->indexes()->data()[1]];
|
||||
MNNBinaryExecute proc;
|
||||
if (outputType.code == halide_type_float) {
|
||||
if (inputType.code == halide_type_float) {
|
||||
proc = static_cast<CPUBackend*>(backend())->functions()->MNNSelectBinaryFunctionForFloat(op->main_as_BinaryOp()->opType());
|
||||
} else {
|
||||
MNN_ASSERT(outputType.code == halide_type_int);
|
||||
MNN_ASSERT(inputType.code == halide_type_int);
|
||||
proc = CPUBinary::selectForInt(op->main_as_BinaryOp()->opType());
|
||||
}
|
||||
auto lastS = cmd->size()->data()[2];
|
||||
|
|
|
@ -1531,6 +1531,8 @@ void cpuinfo_arm_init(struct cpuinfo_arm_isa* cpuinfo_isa) {
|
|||
cpu_family == CPUFAMILY_ARM_EVEREST_SAWTOOTH ||
|
||||
cpu_family == CPUFAMILY_ARM_PCORE_ECORE_COLL;
|
||||
|
||||
cpuinfo_isa->i8mm = cpu_family == CPUFAMILY_ARM_EVEREST_SAWTOOTH ||
|
||||
cpu_family == CPUFAMILY_ARM_PCORE_ECORE_COLL;
|
||||
#endif // iOS
|
||||
|
||||
// arm64-osx
|
||||
|
|
|
@ -45,20 +45,10 @@ ErrorCode CPUUnary::onResize(const std::vector<Tensor *> &inputs, const std::vec
|
|||
static void _Neg(void* out, const void* inp, int realSize) {
|
||||
MNNScaleAndAddBiasScalar((float*)out, (const float*)inp, 0.0f, -1.0f, realSize);
|
||||
}
|
||||
|
||||
static void _NegInt8(void* out, const void* inp, int realSize, QuanPrePostParameters* params) {
|
||||
int sizeDiv16 = realSize / 16;
|
||||
int start = 0;
|
||||
#ifdef MNN_USE_NEON
|
||||
int8_t* outPtr = (int8_t*)out;
|
||||
int8_t* inPtr = (int8_t*)inp;
|
||||
int8x8_t inZeroPoint = vdup_n_s8(params->inputZeroPoint[0]);
|
||||
int8x8_t outZeroPoint = vdup_n_s8(params->outputZeroPoint[0]);
|
||||
float32x4_t inpScale = vdupq_n_f32(params->inputScale[0]);
|
||||
float32x4_t outScale = vdupq_n_f32(params->outputScale[0]);
|
||||
if (sizeDiv16 > 0) {
|
||||
for (int i = 0;i < sizeDiv16; ++i) {
|
||||
int8x16_t negValue = vld1q_s8(inPtr);
|
||||
static inline void exeNegInt8 (int8_t* out, const int8_t* inp, int sizeQuad, int8x8_t inZeroPoint, int8x8_t outZeroPoint, float32x4_t inpScale, float32x4_t outScale) {
|
||||
for (int i = 0; i < sizeQuad; ++i) {
|
||||
int8x16_t negValue = vld1q_s8(inp);
|
||||
int16x8_t val16_0 = vmovl_s8(vget_low_s8(negValue));
|
||||
int16x8_t val16_1 = vmovl_s8(vget_high_s8(negValue));
|
||||
val16_0 = vsubw_s8(val16_0, inZeroPoint);
|
||||
|
@ -98,14 +88,34 @@ static void _NegInt8(void* out, const void* inp, int realSize, QuanPrePostParame
|
|||
int8x8_t v8_0 = vqmovn_s16(v16_4);
|
||||
int8x8_t v8_1 = vqmovn_s16(v16_5);
|
||||
|
||||
vst1_s8(outPtr, v8_0);
|
||||
vst1_s8(outPtr + 8, v8_1);
|
||||
inPtr += 16;
|
||||
outPtr += 16;
|
||||
vst1_s8(out, v8_0);
|
||||
vst1_s8(out + 8, v8_1);
|
||||
inp += 16;
|
||||
out += 16;
|
||||
}
|
||||
start = 16 * sizeDiv16;
|
||||
}
|
||||
#endif
|
||||
static void _NegInt8(void* out, const void* inp, int realSize, QuanPrePostParameters* params) {
|
||||
int sizeDiv16 = realSize / 16;
|
||||
int remain = realSize % 16;
|
||||
#ifdef MNN_USE_NEON
|
||||
int8_t* outPtr = (int8_t*)out;
|
||||
int8_t* inPtr = (int8_t*)inp;
|
||||
int8x8_t inZeroPoint = vdup_n_s8(params->inputZeroPoint[0]);
|
||||
int8x8_t outZeroPoint = vdup_n_s8(params->outputZeroPoint[0]);
|
||||
float32x4_t inpScale = vdupq_n_f32(params->inputScale[0]);
|
||||
float32x4_t outScale = vdupq_n_f32(params->outputScale[0]);
|
||||
if (sizeDiv16 > 0) {
|
||||
exeNegInt8(outPtr, inPtr, sizeDiv16, inZeroPoint, outZeroPoint, inpScale, outScale);
|
||||
}
|
||||
if (remain > 0) {
|
||||
int8_t intmp[16] = {0};
|
||||
int8_t outmp[16] = {0};
|
||||
::memcpy(intmp, reinterpret_cast<const int8_t*>(inp) + 16 * sizeDiv16, remain * sizeof(int8_t));
|
||||
exeNegInt8(outmp, intmp, 1, inZeroPoint, outZeroPoint, inpScale, outScale);
|
||||
::memcpy(reinterpret_cast<int8_t*>(out) + 16 * sizeDiv16, outmp, remain * sizeof(int8_t));
|
||||
}
|
||||
#else
|
||||
#ifdef MNN_USE_SSE
|
||||
uint8_t* dst = (uint8_t*)out;
|
||||
uint8_t* src = (uint8_t*)inp;
|
||||
|
@ -121,7 +131,7 @@ static void _NegInt8(void* out, const void* inp, int realSize, QuanPrePostParame
|
|||
float outscale_ = params->outputScale[0];
|
||||
int min_ = static_cast<int>(params->minValue);
|
||||
int max_ = static_cast<int>(params->maxValue);
|
||||
for (int i = start; i < realSize; ++i) {
|
||||
for (int i = 0; i < realSize; ++i) {
|
||||
int value = -(src[i] - inzero_ - offset) * inscale_ * outscale_ + outzero_;
|
||||
if (value > max_) {
|
||||
value = max_;
|
||||
|
@ -131,24 +141,16 @@ static void _NegInt8(void* out, const void* inp, int realSize, QuanPrePostParame
|
|||
}
|
||||
dst[i] = value + offset;
|
||||
}
|
||||
#endif
|
||||
}
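The refactor above moves the NEON body into exeNegInt8 and handles the tail by copying it into a zero-padded 16-byte stack buffer, running one more vector iteration, and copying the valid part back, instead of falling through to a scalar loop. Per element the operation mirrors the scalar fallback: dequantize, negate, requantize, clamp. A self-contained sketch; parameter and helper names are illustrative, and outScale follows the reciprocal convention used by the fallback above:

#include <algorithm>
#include <cstdint>

static inline int8_t negOneInt8(int8_t v, int8_t inZero, float inScale,
                                int8_t outZero, float outScale,
                                int minValue, int maxValue) {
    float real = (v - inZero) * inScale;                    // dequantize
    int q = static_cast<int>(-real * outScale) + outZero;   // negate and requantize (truncating, as above)
    return static_cast<int8_t>(std::max(minValue, std::min(maxValue, q)));
}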
|
||||
|
||||
static void _ABS(void* out, const void* inp, int realSize) {
|
||||
MNNReluWithSlopeCommon((float*)out, (const float*)inp, realSize, -1.0f);
|
||||
}
|
||||
static void _ABSInt8(void* out, const void* inp, int realSize, QuanPrePostParameters* params) {
|
||||
int sizeDiv16 = realSize / 16;
|
||||
int start = 0;
|
||||
#ifdef MNN_USE_NEON
|
||||
int8_t* outPtr = (int8_t*)out;
|
||||
int8_t* inPtr = (int8_t*)inp;
|
||||
int8x8_t inZeroPoint = vdup_n_s8(params->inputZeroPoint[0]);
|
||||
int8x8_t outZeroPoint = vdup_n_s8(params->outputZeroPoint[0]);
|
||||
float32x4_t inpScale = vdupq_n_f32(params->inputScale[0]);
|
||||
float32x4_t outScale = vdupq_n_f32(params->outputScale[0]);
|
||||
if (sizeDiv16 > 0) {
|
||||
for (int i = 0;i < sizeDiv16; ++i) {
|
||||
int8x16_t absValue = vld1q_s8(inPtr);
|
||||
static inline void exeAbsInt8(int8_t* out, const int8_t* inp, int sizeQuad, int8x8_t inZeroPoint, int8x8_t outZeroPoint, float32x4_t inpScale, float32x4_t outScale) {
|
||||
for (int i = 0;i < sizeQuad; ++i) {
|
||||
int8x16_t absValue = vld1q_s8(inp);
|
||||
int16x8_t val16_0 = vmovl_s8(vget_low_s8(absValue));
|
||||
int16x8_t val16_1 = vmovl_s8(vget_high_s8(absValue));
|
||||
val16_0 = vsubw_s8(val16_0, inZeroPoint);
|
||||
|
@ -188,14 +190,34 @@ static void _ABSInt8(void* out, const void* inp, int realSize, QuanPrePostParame
|
|||
int8x8_t v8_0 = vqmovn_s16(v16_4);
|
||||
int8x8_t v8_1 = vqmovn_s16(v16_5);
|
||||
|
||||
vst1_s8(outPtr, v8_0);
|
||||
vst1_s8(outPtr + 8, v8_1);
|
||||
inPtr += 16;
|
||||
outPtr += 16;
|
||||
vst1_s8(out, v8_0);
|
||||
vst1_s8(out + 8, v8_1);
|
||||
inp += 16;
|
||||
out += 16;
|
||||
}
|
||||
start = 16 * sizeDiv16;
|
||||
}
|
||||
#endif
|
||||
static void _ABSInt8(void* out, const void* inp, int realSize, QuanPrePostParameters* params) {
|
||||
int sizeDiv16 = realSize / 16;
|
||||
int remain = realSize % 16;
|
||||
#ifdef MNN_USE_NEON
|
||||
int8_t* outPtr = (int8_t*)out;
|
||||
int8_t* inPtr = (int8_t*)inp;
|
||||
int8x8_t inZeroPoint = vdup_n_s8(params->inputZeroPoint[0]);
|
||||
int8x8_t outZeroPoint = vdup_n_s8(params->outputZeroPoint[0]);
|
||||
float32x4_t inpScale = vdupq_n_f32(params->inputScale[0]);
|
||||
float32x4_t outScale = vdupq_n_f32(params->outputScale[0]);
|
||||
if (sizeDiv16 > 0) {
|
||||
exeAbsInt8(outPtr, inPtr, sizeDiv16, inZeroPoint, outZeroPoint, inpScale, outScale);
|
||||
}
|
||||
if (remain > 0) {
|
||||
int8_t intmp[16] = {0};
|
||||
int8_t outmp[16] = {0};
|
||||
::memcpy(intmp, reinterpret_cast<const int8_t*>(inp) + 16 * sizeDiv16, remain * sizeof(int8_t));
|
||||
exeAbsInt8(outmp, intmp, 1, inZeroPoint, outZeroPoint, inpScale, outScale);
|
||||
::memcpy(reinterpret_cast<int8_t*>(out) + 16 * sizeDiv16, outmp, remain * sizeof(int8_t));
|
||||
}
|
||||
#else
|
||||
#ifdef MNN_USE_SSE
|
||||
uint8_t* dst = (uint8_t*)out;
|
||||
uint8_t* src = (uint8_t*)inp;
|
||||
|
@ -207,7 +229,7 @@ static void _ABSInt8(void* out, const void* inp, int realSize, QuanPrePostParame
|
|||
#endif
|
||||
int inzero_ = static_cast<int>(params->inputZeroPoint[0]);
|
||||
int outzero_ = static_cast<int>(params->outputZeroPoint[0]);
|
||||
for (int i = start; i < realSize; ++i) {
|
||||
for (int i = 0; i < realSize; ++i) {
|
||||
auto value = abs((src[i] - inzero_ - offset) * params->inputScale[0]);
|
||||
value = value * params->outputScale[0] + outzero_;
|
||||
if (value > params->maxValue) {
|
||||
|
@ -218,11 +240,38 @@ static void _ABSInt8(void* out, const void* inp, int realSize, QuanPrePostParame
|
|||
}
|
||||
dst[i] = value + offset;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef MNN_USE_NEON
|
||||
static inline void exeSignInt8 (int8_t* out, const int8_t* inp, int sizeQuad, int16x8_t one, int16x8_t negone, int16x8_t zero, int8x8_t inZeroPoint, int8x8_t outZeroPoint, float32x4_t outScale) {
|
||||
for (int i = 0; i < sizeQuad; ++i) {
|
||||
int8x16_t value = vld1q_s8(inp);
|
||||
int16x8_t vallow = vmovl_s8(vget_low_s8(value));
|
||||
int16x8_t valhi = vmovl_s8(vget_high_s8(value));
|
||||
vallow = vsubw_s8(vallow, inZeroPoint);
|
||||
valhi = vsubw_s8(valhi, inZeroPoint);
|
||||
uint16x8_t lomask1 = vcgtq_s16(vallow, zero);
|
||||
uint16x8_t lomask_1 = vcltq_s16(vallow, zero);
|
||||
uint16x8_t himask1 = vcgtq_s16(valhi, zero);
|
||||
uint16x8_t himask_1 = vcltq_s16(valhi, zero);
|
||||
uint16x8_t zeromask_low = vceqq_u16(lomask1, lomask_1);
|
||||
uint16x8_t zeromask_hi = vceqq_u16(himask1, himask_1);
|
||||
vallow = vbslq_s16(lomask1, one, negone);
|
||||
vallow = vbslq_s16(zeromask_low, zero, vallow);
|
||||
valhi = vbslq_s16(himask1, one, negone);
|
||||
valhi = vbslq_s16(zeromask_hi, zero, valhi);
|
||||
int8x8_t v8_0 = vqmovn_s16(vallow);
|
||||
int8x8_t v8_1 = vqmovn_s16(valhi);
|
||||
vst1_s8(out, v8_0);
|
||||
vst1_s8(out + 8, v8_1);
|
||||
inp += 16;
|
||||
out += 16;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
static void _SignInt8(void* out, const void* inp, int realSize, QuanPrePostParameters* params) {
|
||||
int sizeDiv16 = realSize / 16;
|
||||
int start = 0;
|
||||
int remain = realSize % 16;
|
||||
#ifdef MNN_USE_NEON
|
||||
int8_t* outPtr = (int8_t*)out;
|
||||
int8_t* inPtr = (int8_t*)inp;
|
||||
|
@ -233,54 +282,16 @@ static void _SignInt8(void* out, const void* inp, int realSize, QuanPrePostParam
|
|||
int8x8_t outZeroPoint = vdup_n_s8(params->outputZeroPoint[0]);
|
||||
float32x4_t outScale = vdupq_n_f32(params->outputScale[0]);
|
||||
if (sizeDiv16 > 0) {
|
||||
for (int i = 0;i < sizeDiv16; ++i) {
|
||||
int8x16_t value = vld1q_s8(inPtr);
|
||||
int16x8_t vallow = vmovl_s8(vget_low_s8(value));
|
||||
int16x8_t valhi = vmovl_s8(vget_high_s8(value));
|
||||
vallow = vsubw_s8(vallow, inZeroPoint);
|
||||
valhi = vsubw_s8(valhi, inZeroPoint);
|
||||
uint16x8_t lomask1 = vcgtq_s16(vallow, zero);
|
||||
uint16x8_t lomask_1 = vcltq_s16(vallow, zero);
|
||||
uint16x8_t himask1 = vcgtq_s16(valhi, zero);
|
||||
uint16x8_t himask_1 = vcltq_s16(valhi, zero);
|
||||
vallow = vbslq_s16(lomask1, vallow, one);
|
||||
vallow = vbslq_s16(lomask_1, vallow, negone);
|
||||
valhi = vbslq_s16(himask1, valhi, one);
|
||||
valhi = vbslq_s16(himask_1, valhi, negone);
|
||||
int32x4_t val32_00 = vmovl_s16(vget_low_s16(vallow));
|
||||
int32x4_t val32_01 = vmovl_s16(vget_high_s16(vallow));
|
||||
int32x4_t val32_10 = vmovl_s16(vget_low_s16(valhi));
|
||||
int32x4_t val32_11 = vmovl_s16(vget_high_s16(valhi));
|
||||
float32x4_t valF_00 = vcvtq_f32_s32(val32_00);
|
||||
float32x4_t valF_01 = vcvtq_f32_s32(val32_01);
|
||||
float32x4_t valF_10 = vcvtq_f32_s32(val32_10);
|
||||
float32x4_t valF_11 = vcvtq_f32_s32(val32_11);
|
||||
valF_00 = vmulq_f32(valF_00, outScale);
|
||||
valF_01 = vmulq_f32(valF_01, outScale);
|
||||
valF_10 = vmulq_f32(valF_10, outScale);
|
||||
valF_11 = vmulq_f32(valF_11, outScale);
|
||||
int32x4_t val_00 = vcvtq_s32_f32(valF_00);
|
||||
int32x4_t val_01 = vcvtq_s32_f32(valF_01);
|
||||
int32x4_t val_10 = vcvtq_s32_f32(valF_10);
|
||||
int32x4_t val_11 = vcvtq_s32_f32(valF_11);
|
||||
int16x4_t v16_0 = vqmovn_s32(val_00);
|
||||
int16x4_t v16_1 = vqmovn_s32(val_01);
|
||||
int16x4_t v16_2 = vqmovn_s32(val_10);
|
||||
int16x4_t v16_3 = vqmovn_s32(val_11);
|
||||
int16x8_t v16_4 = vcombine_s16(v16_0, v16_1);
|
||||
int16x8_t v16_5 = vcombine_s16(v16_2, v16_3);
|
||||
v16_4 = vaddw_s8(v16_4, outZeroPoint);
|
||||
v16_5 = vaddw_s8(v16_5, outZeroPoint);
|
||||
int8x8_t v8_0 = vqmovn_s16(v16_4);
|
||||
int8x8_t v8_1 = vqmovn_s16(v16_5);
|
||||
vst1_s8(outPtr, v8_0);
|
||||
vst1_s8(outPtr + 8, v8_1);
|
||||
inPtr += 16;
|
||||
outPtr += 16;
|
||||
exeSignInt8(outPtr, inPtr, sizeDiv16, one, negone, zero, inZeroPoint, outZeroPoint, outScale);
|
||||
}
|
||||
start = 16 * sizeDiv16;
|
||||
if (remain > 0) {
|
||||
int8_t intmp[16] = {0};
|
||||
int8_t outmp[16] = {0};
|
||||
::memcpy(intmp, reinterpret_cast<const int8_t*>(inp) + 16 * sizeDiv16, remain * sizeof(int8_t));
|
||||
exeSignInt8(outmp, intmp, 1, one, negone, zero, inZeroPoint, outZeroPoint, outScale);
|
||||
::memcpy(reinterpret_cast<int8_t*>(out) + 16 * sizeDiv16, outmp, remain * sizeof(int8_t));
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
#ifdef MNN_USE_SSE
|
||||
uint8_t* dst = (uint8_t*)out;
|
||||
uint8_t* src = (uint8_t*)inp;
|
||||
|
@ -292,7 +303,7 @@ static void _SignInt8(void* out, const void* inp, int realSize, QuanPrePostParam
|
|||
#endif
|
||||
int inzero_ = static_cast<int>(params->inputZeroPoint[0]);
|
||||
int outzero_ = static_cast<int>(params->outputZeroPoint[0]);
|
||||
for (int i = start; i < realSize; ++i) {
|
||||
for (int i = 0; i < realSize; ++i) {
|
||||
auto value = src[i] - offset - inzero_;
|
||||
if (value > 0) {
|
||||
int f = 1 * params->outputScale[0] + outzero_;
|
||||
|
@ -304,6 +315,7 @@ static void _SignInt8(void* out, const void* inp, int realSize, QuanPrePostParam
|
|||
dst[i] = outzero_ + offset;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
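_SignInt8 gets the same treatment, with the rewritten NEON path selecting +1/-1/0 through vbsl against explicit greater/less/equal masks before requantizing. Per element this reduces to the following sketch (helper name illustrative):

#include <cstdint>

static inline int8_t signOneInt8(int8_t v, int8_t inZero, int8_t outZero, float outScale) {
    int x = v - inZero;
    int s = (x > 0) - (x < 0);                              // -1, 0 or +1
    return static_cast<int8_t>(static_cast<int>(s * outScale) + outZero);
}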
|
||||
|
||||
static void _Square(void* out, const void* inp, int realSize) {
|
||||
|
|
|
@ -45,8 +45,8 @@ vdup.32 q10, r8 //q10: [28.f]x4
|
|||
vdup.32 q9, r10 //q9: [3150.f]x4
|
||||
vdup.32 q8, r11 //q8: [62370.f]x4
|
||||
|
||||
mov r4, #5.0
|
||||
mov r5, #-5.0
|
||||
mov r4, #5
|
||||
mov r5, #-5
|
||||
|
||||
GeluZLoop:
|
||||
|
||||
|
@ -68,6 +68,8 @@ vmul.f32 q3, q3, q14 // value
|
|||
// if value > 5, then value=5; if value<-5, then value=-5
|
||||
vdup.32 q7, r4
|
||||
vdup.32 q6, r5
|
||||
vcvt.f32.s32 q7, q7
|
||||
vcvt.f32.s32 q6, q6
|
||||
vmax.f32 q2, q2, q6
|
||||
vmax.f32 q3, q3, q6
|
||||
vmin.f32 q2, q2, q7
|
||||
|
|
|
@ -45,8 +45,8 @@ vdup.32 q10, r8 //q10: [28.f]x4
|
|||
vdup.32 q9, r10 //q9: [3150.f]x4
|
||||
vdup.32 q8, r11 //q8: [62370.f]x4
|
||||
|
||||
mov r4, #5.0
|
||||
mov r5, #-5.0
|
||||
mov r4, #5
|
||||
mov r5, #-5
|
||||
|
||||
GeluZLoop:
|
||||
|
||||
|
@ -70,6 +70,8 @@ vmul.f32 q3, q3, q14
|
|||
|
||||
vdup.32 q7, r4
|
||||
vdup.32 q6, r5
|
||||
vcvt.f32.s32 q7, q7
|
||||
vcvt.f32.s32 q6, q6
|
||||
vmax.f32 q2, q2, q6
|
||||
vmax.f32 q3, q3, q6
|
||||
vmin.f32 q2, q2, q7
|
||||
|
|
|
@@ -2595,35 +2595,20 @@ void MNNPackTranspose(float* dst, const float* src, size_t area, size_t depth, i
}

void MNNExp(float* dst, const float* src, const float* offset, size_t dataSize) {
int countC8 = (int)dataSize / 8;
if (countC8 > 0) {
// Align to eight so asm is easier to write
int countC8 = static_cast<int32_t>(dataSize) / 8;
int remain = static_cast<int32_t>(dataSize) % 8;
float parameters[] = {
(float)logf(2.0f), 1.0f / (float)logf(2.0f), 1.0f, 1.0f, 0.5f, 1.0f / 6.0f, 1.0f / 24.0f, 1.0f / 120.0f};
if (countC8 > 0) {
// Align to eight so asm is easier to write
MNNExpC8(dst, src, offset, parameters, countC8);
}
float alpha = offset[0];
float beta = offset[1];
int remain = countC8 * 8;
auto param = logf(2.0f);
float xLimit = 87;
for (int i = remain; i < dataSize; i++) {
/*Origin Function*/
//dst[i] = expf(src[i] * alpha) + beta;
/*Approciate Function*/

auto x = alpha * src[i];
x = ALIMAX(x, -xLimit);
x = ALIMIN(x, xLimit);

int div = (x / param);
int div2 = (div + 127) << 23;
auto xReamin = x - div * param;
float expBasic = *(float*)(&div2);

auto t = xReamin;
auto expRemain = ((((1.0f / 120 * t + 1.0f / 24) * t + 1.0f / 6) * t + 0.5f) * t + 1.0f) * t + 1.0f;
dst[i] = expBasic * expRemain + beta;
if (remain > 0) {
float intmp[8] = {0};
float outmp[8] = {0};
::memcpy(intmp, src + 8 * countC8, remain * sizeof(float));
MNNExpC8(outmp, intmp, offset, parameters, 1);
::memcpy(dst + 8 * countC8, outmp, remain * sizeof(float));
}
}

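Note: the scalar tail loop removed here (and the MNNExpC8 kernel that now also handles the remainder through a padded 8-float buffer) evaluates exp with the classic range-reduction trick. A self-contained sketch, assuming offset = {alpha, beta} means dst = exp(alpha * src) + beta as in the removed code:

#include <cmath>
#include <cstring>
float expApprox(float x) {
    x = std::fmin(std::fmax(x, -87.0f), 87.0f);   // keep 2^n representable
    const float ln2 = std::log(2.0f);
    int n = static_cast<int>(x / ln2);            // x = n * ln2 + r
    float r = x - n * ln2;
    int bits = (n + 127) << 23;                   // build 2^n in the exponent field
    float pow2n;
    std::memcpy(&pow2n, &bits, sizeof(bits));     // bit-cast without UB
    float poly = ((((1.f / 120 * r + 1.f / 24) * r + 1.f / 6) * r + 0.5f) * r + 1.f) * r + 1.f;
    return pow2n * poly;                          // 2^n * exp(r), 5th-order Taylor for exp(r)
}
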
@@ -2670,30 +2655,33 @@ void MNNReluWithSlope(float* dst, const float* src, size_t sizeQuad, float slope
}

void MNNReluWithSlopeCommon(float* dst, const float* src, size_t size, float slope) {
int sizeQuad = size / 4;
int start = 0;
int sizeQuad = static_cast<int32_t>(size) / 4;
int remain = static_cast<int32_t>(size) % 4;
if (sizeQuad > 0) {
MNNReluWithSlope(dst, src, sizeQuad, slope);
start = sizeQuad * 4;
}
for (int j = start; j < size; j++) {
if (src[j] < 0) {
dst[j] = src[j] * slope;
} else {
dst[j] = src[j];
}
if (remain > 0) {
float intmp[4] = {0}, outmp[4] = {0};
::memcpy(intmp, src + sizeQuad * 4, remain * sizeof(float));
MNNReluWithSlope(outmp, intmp, 1, slope);
::memcpy(dst + sizeQuad * 4, outmp, remain * sizeof(float));
}
}

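Note: this is the pattern the whole commit applies to the CPU activation kernels: drop the scalar remainder loop, copy the tail into a zero-padded stack block, run the SIMD kernel once on that block, and copy only the valid elements back. A generic sketch (kernel is assumed to process BLOCK elements per count):

#include <cstring>
template <int BLOCK, typename Kernel>
void runWithTail(float* dst, const float* src, size_t size, Kernel kernel) {
    int blocks = static_cast<int>(size) / BLOCK;
    int remain = static_cast<int>(size) % BLOCK;
    if (blocks > 0) {
        kernel(dst, src, blocks);                        // full SIMD blocks
    }
    if (remain > 0) {
        float in[BLOCK] = {0}, out[BLOCK] = {0};         // zero-padded scratch
        std::memcpy(in, src + blocks * BLOCK, remain * sizeof(float));
        kernel(out, in, 1);                              // one padded block
        std::memcpy(dst + blocks * BLOCK, out, remain * sizeof(float));
    }
}
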
void MNNHardSwishCommon(float* dst, const float* src, size_t size) {
int sizeQuad = static_cast<int32_t>(size / 4);
int start = 0;
int remain = static_cast<int32_t>(size) % 4;
#ifdef MNN_USE_SSE
if (sizeQuad > 0) {
MNNHardSwish(dst, src, sizeQuad);
start = sizeQuad * 4;
}
#endif
if (remain > 0) {
float intmp[4] = {0}, outmp[4] = {0};
::memcpy(intmp, src + sizeQuad * 4, remain * sizeof(float));
MNNHardSwish(outmp, intmp, 1);
::memcpy(dst + sizeQuad * 4, outmp, remain * sizeof(float));
}
#else
#ifdef MNN_USE_NEON
float32x4_t zero = vdupq_n_f32(0.f);
float32x4_t three = vdupq_n_f32(3.f);

@@ -2704,9 +2692,16 @@ void MNNHardSwishCommon(float* dst, const float* src, size_t size) {
auto y = vmulq_f32(vmulq_f32(x, vminq_f32(vmaxq_f32(vaddq_f32(x, three), zero), six)), divsix);
vst1q_f32(dst + 4 * i, y);
}
start = sizeQuad * 4;
#endif
for (int j = start; j < size; j++) {
if (remain > 0) {
float intmp[4] = {0}, outmp[4] = {0};
::memcpy(intmp, src + sizeQuad * 4, remain * sizeof(float));
auto x = vld1q_f32(intmp);
auto y = vmulq_f32(vmulq_f32(x, vminq_f32(vmaxq_f32(vaddq_f32(x, three), zero), six)), divsix);
vst1q_f32(outmp, y);
::memcpy(dst + sizeQuad * 4, outmp, remain * sizeof(float));
}
#else
for (int j = 0; j < size; j++) {
if (src[j] <= -3) {
dst[j] = 0;
} else if (src[j] >= 3){

@@ -2715,6 +2710,8 @@ void MNNHardSwishCommon(float* dst, const float* src, size_t size) {
dst[j] = src[j] * (src[j] + 3) / 6.f;
}
}
#endif
#endif
}

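Note: the SSE and NEON branches above compute the standard hard-swish, y = x * relu6(x + 3) / 6, matching the scalar fallback. A one-line scalar reference:

float hardSwishRef(float x) {
    float t = x + 3.0f;
    t = t < 0.f ? 0.f : (t > 6.f ? 6.f : t);  // relu6(x + 3)
    return x * t / 6.0f;
}
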
void MNNGeluStandardCommon(float* dst, const float* src, size_t size) {

@@ -2725,14 +2722,20 @@ void MNNGeluStandardCommon(float* dst, const float* src, size_t size) {

void MNNGeluCommon(float* dst, const float* src, size_t size) {
int sizeQuad = static_cast<int32_t>(size / 8);
int start = 0;
int remain = static_cast<int32_t>(size) % 8;
#if defined(MNN_USE_SSE) || defined(MNN_USE_NEON)
if (sizeQuad > 0) {
float parameters[8] = {0.044715f, 0.79788458f, 378.f, 17325.f, 135135.f, 28.f, 3150.f, 62370.f};
if (sizeQuad > 0) {
MNNGelu(dst, src, sizeQuad, parameters);
start = sizeQuad * 8;
}
#endif
if (remain > 0) {
float intmp[8] = {0};
float outmp[8] = {0};
::memcpy(intmp, src + 8 * sizeQuad, remain * sizeof(float));
MNNGelu(outmp, intmp, 1, parameters);
::memcpy(dst + 8 * sizeQuad, outmp, remain * sizeof(float));
}
#else
auto tanhf_poly = [](float value) -> float {
if (value > 5.0f) {
return 1.0f;

@@ -2745,11 +2748,12 @@ void MNNGeluCommon(float* dst, const float* src, size_t size) {
return a / b;
}
};
for (int i = start; i < size; i++) {
for (int i = 0; i < size; i++) {
float temp = 0.044715f * src[i] * src[i] * src[i];
temp = 0.79788458f * (temp + src[i]);
dst[i] = (1.0f + tanhf_poly(temp)) * src[i] * 0.5f;
}
#endif
}

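Note: the parameter block {0.044715, 0.79788458, 378, 17325, 135135, 28, 3150, 62370} feeds MNNGelu's polynomial-ratio tanh approximation (the same 3150/62370/28 coefficients appear in the assembly hunks above), while the scalar fallback is the usual tanh form of GELU. A reference using the exact tanhf:

#include <cmath>
float geluRef(float x) {
    float inner = 0.79788458f * (x + 0.044715f * x * x * x); // sqrt(2/pi) * (x + 0.044715 x^3)
    return 0.5f * x * (1.0f + std::tanh(inner));
}
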
void MNNScaleAndAddBiasScalar(float* dst, const float* src, float bias, float alpha, size_t number) {

@@ -3056,11 +3060,13 @@ void MNNSigmoidLowp(float* dst, const float* src, size_t dataSize) {
};
MNNExp(dst, src, offset, dataSize);
#ifdef MNN_USE_NEON
int dataC4 = (int)dataSize / 4;
if(dataC4 > 0) {
// neon optimization for sigmid cpu
int dataC4 = static_cast<int32_t>(dataSize) / 4;
int remain = static_cast<int32_t>(dataSize) % 4;
float32x4_t value = vdupq_n_f32(1.0f);

if(dataC4 > 0) {
float32x4_t out = vld1q_f32(dst);
// neon optimization for sigmid cpu
for (int i = 1; i < dataC4; ++i) {
out = vrecpeq_f32(vaddq_f32(value,out));
vst1q_f32(dst ,out);

@@ -3070,12 +3076,20 @@ void MNNSigmoidLowp(float* dst, const float* src, size_t dataSize) {
out = vrecpeq_f32(vaddq_f32(value,out));
vst1q_f32(dst, out);
dst += 4;
dataSize = dataSize - 4 * dataC4;
}
#endif
if (remain > 0) {
float intmp[4] = {0};
::memcpy(intmp, dst, remain * sizeof(float));
float32x4_t out = vld1q_f32(intmp);
out = vrecpeq_f32(vaddq_f32(value,out));
vst1q_f32(intmp, out);
::memcpy(dst, intmp, remain * sizeof(float));
}
#else
for (int i = 0; i < dataSize; ++i) {
dst[i] = 1.0f / (1.0f + dst[i]);
}
#endif
}

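Note: MNNSigmoidLowp first calls MNNExp so that dst holds exp(-x) (assuming the offset array set just above this hunk is {-1.f, 0.f}), then turns each value into 1/(1+exp(-x)); the NEON path uses vrecpeq_f32, a reciprocal estimate with roughly 8 bits of precision, which is where the "Lowp" comes from. Scalar shape of the second pass:

void sigmoidFromExp(float* dst, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        dst[i] = 1.0f / (1.0f + dst[i]);   // dst[i] already holds exp(-x[i])
    }
}
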
void MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow, const float* bias, const float* parameters) {

@@ -231,6 +231,10 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
} else {
quanParam.minValue = mMutableResource.mClampMin;
}
int dstBytes = static_cast<CPUBackend*>(backend())->getBytes(backend(), output);
if (dstBytes != 1) {
quanParam.useInt8 = 0;
}
//MNN_PRINT("max: %d, min: %d\n", quanParam.maxValue, quanParam.minValue);
const int col_buffer_unit_size = mIm2ColParamter.kernelCountUnit * DST_XUNIT * SRC_UNIT * sizeof(int8_t);
auto col_buffer_size = col_buffer_unit_size * mIm2ColCount;

@@ -262,13 +266,13 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
if (number > 0) {
blitProc(colAddr, srcPtr, info, el);
}
auto outputInTilePtr = outputDataPtr + xIndexStart * PackUnit;
auto outputInTilePtr = outputDataPtr + xIndexStart * PackUnit * dstBytes;
auto colAddrTemp = colAddr;
do {
int step = ALIMIN(DST_XUNIT, realDstCount);
mGemmKernel(outputInTilePtr, colAddrTemp, weightDataPtr, kernelCountUnitDouble, dstZStep, ocDiv4, &quanParam, step);
mGemmKernel(outputInTilePtr, colAddrTemp, weightDataPtr, kernelCountUnitDouble, dstZStep * dstBytes, ocDiv4, &quanParam, step);
realDstCount-=step;
outputInTilePtr += DST_XUNIT * PackUnit;
outputInTilePtr += DST_XUNIT * PackUnit * dstBytes;
colAddrTemp += col_buffer_unit_size;
} while(realDstCount > 0);
}

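Note: the new dstBytes factor covers the case where the int8 GEMM writes a wider output (getBytes != 1, e.g. a float destination once quanParam.useInt8 is cleared); the tile pointer and the z-stride passed to mGemmKernel are expressed in elements, so they must be scaled by the element size when the destination is no longer one byte per value. A byte-addressed sketch with hypothetical names:

#include <cstddef>
#include <cstdint>
uint8_t* outputTilePtr(uint8_t* base, int xIndexStart, int packUnit, int dstBytes) {
    // dstBytes == 1 reproduces the old arithmetic; dstBytes == 4 is a float output
    return base + static_cast<size_t>(xIndexStart) * packUnit * dstBytes;
}
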
@ -110,9 +110,6 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs,
|
|||
auto matrixSizeE = output->height() * output->width() * input->batch();
|
||||
auto outputPlane = output->height() * output->width();
|
||||
mUnits.clear();
|
||||
auto inputPtr = TensorUtils::getDescribe(input)->mem->chunk();
|
||||
auto outputPtr = TensorUtils::getDescribe(output)->mem->chunk();
|
||||
|
||||
std::shared_ptr<char> __autoFunction;
|
||||
auto padY = mPadY;
|
||||
auto padX = mPadX;
|
||||
|
@ -156,9 +153,9 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs,
|
|||
int e = planeSize;
|
||||
int l = ic;
|
||||
int h = oc;
|
||||
auto aPtr = inputPtr + core->pack * planeStart * bytes;
|
||||
uint8_t* aPtr = nullptr;
|
||||
auto bPtr = TensorUtils::getDescribe(weightTensor)->mem->chunk();;
|
||||
auto cPtr = outputPtr + core->pack * planeStart * bytes;
|
||||
uint8_t* cPtr = nullptr;
|
||||
auto biasPtr = TensorUtils::getDescribe(mResource->mBias.get())->mem->chunk();
|
||||
memoryPool->beginGroup();
|
||||
auto code = unit.mStracssenComputor->onEncode(e, l, h, matrixSizeE * core->pack, UP_DIV(l, lPack) * lPack * hPack, matrixSizeE * core->pack, aPtr, bPtr, cPtr, true, biasPtr, postParameters);
|
||||
|
@ -200,9 +197,9 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs,
|
|||
int e = matrixSizeE;
|
||||
int l = ic;
|
||||
int h = std::min(ocSize * core->pack, ocWeightSize * hPack);
|
||||
auto aPtr = inputPtr;
|
||||
uint8_t* aPtr = nullptr;
|
||||
auto bPtr = TensorUtils::getDescribe(mResource->mWeight.get())->mem->chunk() + hPack * icAlign * ocStartWeight * bytes;
|
||||
auto cPtr = outputPtr + core->pack * matrixSizeE * ocStart * bytes;
|
||||
uint8_t* cPtr = nullptr;
|
||||
auto biasPtr = TensorUtils::getDescribe(mResource->mBias.get())->mem->chunk() + core->pack * ocStart * bytes;
|
||||
memoryPool->beginGroup();
|
||||
auto code = unit.mStracssenComputor->onEncode(e, l, h, matrixSizeE * core->pack, UP_DIV(l, lPack) * lPack * hPack, matrixSizeE * core->pack, aPtr, bPtr, cPtr, true, biasPtr, postParameters);
|
||||
|
|
|
@ -1453,7 +1453,9 @@ static void MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, co
|
|||
for (int j = 0; j < GEMM_INT8_UNIT; ++j) {
|
||||
const auto weight_j = weight_sz + j * GEMM_INT8_SRC_UNIT;
|
||||
for (int i = 0; i < GEMM_INT8_SRC_UNIT; ++i) {
|
||||
// if (j == 2) printf("%d, %d\n", (int32_t)src_z[i], (int32_t)weight_j[i]);
|
||||
dstTemp[j] += (int32_t)src_z[i] * (int32_t)weight_j[i];
|
||||
// if (j == 0) printf("%d\n", dstTemp[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -6,7 +6,6 @@ if(MNN_CUDA_PROFILE)
|
|||
set(EXTRA_LIBS -lnvToolsExt)
|
||||
endif()
|
||||
|
||||
|
||||
if(CUDA_FOUND)
|
||||
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -D_FORCE_INLINES -Wno-deprecated-gpu-targets -w ${EXTRA_LIBS}")
|
||||
if(CMAKE_BUILD_TYPE MATCHES Debug)
|
||||
|
@ -52,6 +51,7 @@ if(CUDA_FOUND)
|
|||
ENDIF()
|
||||
|
||||
# Limit minimum cuda version for each archs
|
||||
|
||||
IF (${arch_count} EQUAL 1)
|
||||
IF ((CUDA_ARCH_FLAGS_readable_code VERSION_GREATER "80") OR (CUDA_ARCH_FLAGS_readable_code VERSION_EQUAL "80"))
|
||||
IF (CUDA_VERSION VERSION_LESS "11.2")
|
||||
|
|
|
@ -52,7 +52,8 @@ ErrorCode BinaryExecution::onExecute(const std::vector<Tensor *> &inputs, const
|
|||
int stride1[3] = {0, 0, s1};
|
||||
int stride2[3] = {0, 0, 1};
|
||||
|
||||
auto type = outputs[0]->getType();
|
||||
// use input type. output type maybe fixed, for example greater/less
|
||||
auto type = inputs[0]->getType();
|
||||
if (type.code == halide_type_float) {
|
||||
// Use Half or float
|
||||
type.bits = static_cast<CUDABackend*>(backend())->getBytes(inputs[0]) * 8;
|
||||
|
|
|
@ -204,8 +204,9 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
|
|||
} else if (mGpuComputeCap < 75) {
|
||||
return callCutlassGemmTensorCore884(inputs, outputs);
|
||||
}
|
||||
|
||||
#ifdef ENABLE_CUDA_TUNE_PARAM
|
||||
if (mGpuComputeCap >= 80) {
|
||||
mIsTuned = true;
|
||||
/*
|
||||
// 0 -> Gemm, 1~N -> BatchGemm
|
||||
int32_t batchSize = 0;
|
||||
|
@ -243,10 +244,10 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
|
|||
// set preferd block shape argments
|
||||
setGemmTensorCoreFloat16Argments(&mInfo);
|
||||
return NO_ERROR;
|
||||
#else
|
||||
}
|
||||
#endif
|
||||
|
||||
return callCutlassGemmTensorCore(inputs, outputs);
|
||||
#endif
|
||||
}
|
||||
|
||||
ErrorCode ConvCutlassExecution::onExecute(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) {
|
||||
|
|
|
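Note: the ENABLE_CUDA_TUNE_PARAM hunks above (and the matching ones in ConvWinogradExecution, MatMulExecution and CutlassConvCommonExecution below) change the tuned path from a compile-time alternative into a runtime choice: the macro block now only sets mIsTuned on sm80+ GPUs, and an unconditional if(!mIsTuned) branch keeps the default CUTLASS configuration available. A sketch of the resulting control flow, with hypothetical names:

bool chooseKernel(int gpuComputeCap) {
    bool isTuned = false;
#ifdef ENABLE_CUDA_TUNE_PARAM
    if (gpuComputeCap >= 80) {
        isTuned = true;   // use the tuned block-shape table (sm80+ only)
    }
#endif
    if (!isTuned) {
        // fall back to the default CUTLASS tensor-core GEMM arguments
    }
    return isTuned;
}
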
@ -310,6 +310,8 @@ ErrorCode ConvWinogradExecution::onResize(const std::vector<Tensor*> &inputs, c
|
|||
//MNN_PRINT("Winograd BatchGemm batch:%d, MNK:%d-%d-%d\n", mBlock2, mGemmInfo.elh[0], mGemmInfo.elhPad[2], mGemmInfo.elhPad[1]);
|
||||
if(mFp16Infer) {
|
||||
#ifdef ENABLE_CUDA_TUNE_PARAM
|
||||
if(mGpuComputeCap >= 80 ) {
|
||||
mIsTuned = true;
|
||||
/*
|
||||
// 0 -> Gemm, 1~N -> BatchGemm
|
||||
int32_t batchSize = 0;
|
||||
|
@ -349,8 +351,9 @@ ErrorCode ConvWinogradExecution::onResize(const std::vector<Tensor*> &inputs, c
|
|||
getGemmBatchedTensorCoreFloat16Param(&mInfo);
|
||||
// set preferd block shape argments
|
||||
setGemmBatchedTensorCoreFloat16Argments(&mInfo);
|
||||
|
||||
#else
|
||||
}
|
||||
#endif
|
||||
if(!mIsTuned) {
|
||||
typename GemmBatchedTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
|
||||
{(ElementInput_F16 *)mBtdB_Buffer, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]), // batch_stride_A
|
||||
|
@ -378,7 +381,7 @@ ErrorCode ConvWinogradExecution::onResize(const std::vector<Tensor*> &inputs, c
|
|||
// Initialize CUTLASS kernel with arguments and workspace pointer
|
||||
status = mGemmBatchedF16F16LnSm75.initialize(arguments, (uint8_t *)mWorkspace);
|
||||
cutlass_check(status);
|
||||
#endif
|
||||
}
|
||||
} else {
|
||||
|
||||
typename GemmBatchedTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
|
||||
|
@ -478,11 +481,14 @@ ErrorCode ConvWinogradExecution::onExecute(const std::vector<Tensor*> &inputs, c
|
|||
cutlass_check(status);
|
||||
} else {
|
||||
#ifdef ENABLE_CUDA_TUNE_PARAM
|
||||
if(mIsTuned) {
|
||||
runGemmBatchedTensorCoreFloat16Infer(&mInfo);
|
||||
#else
|
||||
}
|
||||
#endif
|
||||
if(!mIsTuned) {
|
||||
cutlass::Status status = mGemmBatchedF16F16LnSm75();
|
||||
cutlass_check(status);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -71,6 +71,7 @@ private:
|
|||
int mPadY;
|
||||
int mBlock2;
|
||||
int mGpuComputeCap;
|
||||
bool mIsTuned =false;
|
||||
int mActivationType;
|
||||
bool mFp16Infer = false;
|
||||
bool mFp32Infer = false;
|
||||
|
|
|
@ -481,6 +481,8 @@ void MatMulExecution::setArguments(const std::vector<Tensor *> &inputs, const st
|
|||
|
||||
if(mFp16Infer) {
|
||||
#ifdef ENABLE_CUDA_TUNE_PARAM
|
||||
if(mGpuComputeCap >= 80) {
|
||||
mIsTuned = true;
|
||||
/*
|
||||
// 0 -> Gemm, 1~N -> BatchGemm
|
||||
int32_t batchSize = 0;
|
||||
|
@ -555,7 +557,9 @@ void MatMulExecution::setArguments(const std::vector<Tensor *> &inputs, const st
|
|||
|
||||
// set preferd block shape argments
|
||||
setGemmBatchedTensorCoreFloat16Argments(&mInfo);
|
||||
#else
|
||||
}
|
||||
#endif
|
||||
if(!mIsTuned) {
|
||||
if(mUseRRLayout) {
|
||||
typename GemmBatchedTensor_F16_F16_Linear_AlignTensor_Row_Row_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
|
||||
{(ElementInput_F16 *)mTempMatA, mGemmInfo.elhPad[1]}, // Ptr + ldm
|
||||
|
@ -689,7 +693,7 @@ void MatMulExecution::setArguments(const std::vector<Tensor *> &inputs, const st
|
|||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
} else {
|
||||
if(mUseRRLayout) {
|
||||
if(mNeedConvertMatAB) {
|
||||
|
@ -1240,8 +1244,11 @@ ErrorCode MatMulExecution::onExecute(const std::vector<Tensor *> &inputs, const
|
|||
|
||||
} else {
|
||||
#ifdef ENABLE_CUDA_TUNE_PARAM
|
||||
if(mIsTuned) {
|
||||
runGemmBatchedTensorCoreFloat16Infer(&mInfo);
|
||||
#else
|
||||
}
|
||||
#endif
|
||||
if(!mIsTuned) {
|
||||
if(mUseRRLayout) {
|
||||
cutlass::Status status = mGemmBatchedF16F16LnAlign8RRSm75();
|
||||
cutlass_check(status);
|
||||
|
@ -1264,7 +1271,7 @@ ErrorCode MatMulExecution::onExecute(const std::vector<Tensor *> &inputs, const
|
|||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
// printf("normal:%d rrlayout:%d convertab:%d halign:%d\n", mFp16Fp32MixInfer, mUseRRLayout, mNeedConvertMatAB, hAlignment);
|
||||
return NO_ERROR;
|
||||
|
|
|
@ -84,6 +84,7 @@ private:
|
|||
CutlassGemmInfo mGemmInfo;
|
||||
int mBatch = 1;
|
||||
int mGpuComputeCap;
|
||||
bool mIsTuned = false;
|
||||
bool mFp16Infer = false;
|
||||
bool mFp32Infer = false;
|
||||
bool mFp16Fp32MixInfer = false;
|
||||
|
|
|
@ -1083,9 +1083,17 @@ void BinaryBlit(uint8_t* output, const uint8_t* input, const uint8_t* input1, co
|
|||
BinaryBlitTemplateFloat((float*)output, (float*)input, (float*)input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType, activationType);
|
||||
} else if (type.bits == 16) {
|
||||
BinaryBlitTemplateFloat((half*)output, (half*)input, (half*)input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType, activationType);
|
||||
} else {
|
||||
MNN_ERROR("CUDA not supoort data code:%d, data bits:%d\n", type.code, type.bits);
|
||||
}
|
||||
} else if (type.code == halide_type_int) {
|
||||
if(type.bits == 32) {
|
||||
BinaryBlitTemplateInt32(output, input, input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType, activationType);
|
||||
} else {
|
||||
MNN_ERROR("CUDA not supoort data code:%d, data bits:%d\n", type.code, type.bits);
|
||||
}
|
||||
} else {
|
||||
MNN_ERROR("CUDA not supoort data code:%d, data bits:%d\n", type.code, type.bits);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -110,8 +110,11 @@ ErrorCode CutlassConvCommonExecution::runCutlassGemmFunc() {
|
|||
}
|
||||
|
||||
#ifdef ENABLE_CUDA_TUNE_PARAM
|
||||
if(mIsTuned) {
|
||||
runGemmTensorCoreFloat16Infer(&mInfo);
|
||||
#else
|
||||
}
|
||||
#endif
|
||||
if(!mIsTuned) {
|
||||
if(mActivationType == 1) {
|
||||
if(mFp16Fp32MixInfer) {
|
||||
cutlass::Status status = mGemmF16F32ReluSm75();
|
||||
|
@ -137,7 +140,7 @@ ErrorCode CutlassConvCommonExecution::runCutlassGemmFunc() {
|
|||
cutlass_check(status);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
|
|
|
@ -94,6 +94,7 @@ protected:
|
|||
GemmTensor_BF16_BF16_Relu6_AlignTensor_Sm80 mGemmBF16BF16Relu6Sm80;
|
||||
#endif
|
||||
int mGpuComputeCap = 75;
|
||||
bool mIsTuned = false;
|
||||
int mActivationType = 0;
|
||||
bool mFp16Infer = false;
|
||||
bool mFp32Infer = false;
|
||||
|
|
|
@ -871,6 +871,17 @@ const char* shader_MetalBackend_metal =
|
|||
" uint4 extent;//dstStride[3]+dstOffset\n"
|
||||
" uint4 imageSize;\n"
|
||||
"};\n"
|
||||
"struct MemsetInfo {\n"
|
||||
" int4 V;\n"
|
||||
" uint4 size;\n"
|
||||
"};\n"
|
||||
"kernel void fill_intx4(device int4 *out [[buffer(0)]],\n"
|
||||
" constant MemsetInfo &info [[buffer(1)]],\n"
|
||||
" uint3 gid [[thread_position_in_grid]]) {\n"
|
||||
" if (gid.x<info.size.x) {\n"
|
||||
" out[gid.x]=info.V;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"kernel void blit_intx4(const device int4 *in [[buffer(0)]],\n"
|
||||
" device int4 *out [[buffer(1)]],\n"
|
||||
" constant SamplerInfo &info [[buffer(2)]],\n"
|
||||
|
@ -1776,34 +1787,6 @@ const char* shader_MetalROIPooling_metal =
|
|||
" out[int(gid.z)*s.output_size+int(gid.y)*s.output_width+int(gid.x)]=max4;\n"
|
||||
"}\n"
|
||||
;
|
||||
const char* shader_MetalCast_metal =
|
||||
"using namespace metal;\n"
|
||||
"kernel void cast_float_to_int32(const device M *in [[buffer(0)]],\n"
|
||||
" device int *out [[buffer(1)]],\n"
|
||||
" uint gid [[thread_position_in_grid]]) {\n"
|
||||
" out[int(gid)]=int(in[int(gid)]);\n"
|
||||
"}\n"
|
||||
"kernel void cast_int32_to_float(const device int *in [[buffer(0)]],\n"
|
||||
" device M *out [[buffer(1)]],\n"
|
||||
" uint gid [[thread_position_in_grid]]) {\n"
|
||||
" out[int(gid)]=M(in[int(gid)]);\n"
|
||||
"}\n"
|
||||
"kernel void cast_uint8_to_float(const device uchar *in [[buffer(0)]],\n"
|
||||
" device M *out [[buffer(1)]],\n"
|
||||
" uint gid [[thread_position_in_grid]]) {\n"
|
||||
" out[int(gid)]=M(in[int(gid)]);\n"
|
||||
"}\n"
|
||||
"kernel void cast_uint8_to_int(const device uchar *in [[buffer(0)]],\n"
|
||||
" device int *out [[buffer(1)]],\n"
|
||||
" uint gid [[thread_position_in_grid]]) {\n"
|
||||
" out[int(gid)]=M(in[int(gid)]);\n"
|
||||
"}\n"
|
||||
"kernel void cast_float_to_uint8(const device M *in [[buffer(0)]],\n"
|
||||
" device uchar *out [[buffer(1)]],\n"
|
||||
" uint gid [[thread_position_in_grid]]) {\n"
|
||||
" out[int(gid)]=uchar(in[int(gid)]);\n"
|
||||
"}\n"
|
||||
;
|
||||
const char* shader_MetalConvolution1x1_metal =
|
||||
"#define CONV_UNROLL (4)\n"
|
||||
"#define CONV_UNROLL_L (8)\n"
|
||||
|
@ -2200,68 +2183,6 @@ const char* shader_MetalConvolution1x1_metal =
|
|||
"}\n"
|
||||
;
|
||||
const char* shader_MetalConvolutionGEMM_metal =
|
||||
"struct conv_im2col_cst {\n"
|
||||
" int input_width;\n"
|
||||
" int input_height;\n"
|
||||
" int input_size;\n"
|
||||
" int input_slice;\n"
|
||||
" int output_width;\n"
|
||||
" int output_height;\n"
|
||||
" int output_size;\n"
|
||||
" int output_slice;\n"
|
||||
" int batch;\n"
|
||||
" \n"
|
||||
" int kernel_x;\n"
|
||||
" int kernel_y;\n"
|
||||
" int kernel_size;\n"
|
||||
" int stride_x;\n"
|
||||
" int stride_y;\n"
|
||||
" int pad_x;\n"
|
||||
" int pad_y;\n"
|
||||
" int dilation_x;\n"
|
||||
" int dilation_y;\n"
|
||||
" conv_activation_type activation;\n"
|
||||
"};\n"
|
||||
"kernel void conv_im2col(const device M4 *im [[buffer(0)]],\n"
|
||||
" device M4 *cols [[buffer(1)]],\n"
|
||||
" constant conv_im2col_cst& cst [[buffer(2)]],\n"
|
||||
" uint3 gid [[thread_position_in_grid]]) {\n"
|
||||
" auto z=gid.z % cst.input_slice;\n"
|
||||
" auto b=gid.z/cst.input_slice;\n"
|
||||
" if ((int)gid.x<cst.output_width && (int)gid.y<cst.output_height && (int)b<cst.batch) {\n"
|
||||
" int offset_x=gid.x*cst.stride_x-cst.pad_x;\n"
|
||||
" int offset_y=gid.y*cst.stride_y-cst.pad_y;\n"
|
||||
" int index=b*cst.output_size+gid.y*cst.output_width+gid.x;\n"
|
||||
" int cols_y=index/4;\n"
|
||||
" int cols_x=index % 4+z*cst.kernel_size*4;\n"
|
||||
" \n"
|
||||
" auto xy_cols=cols+cols_y*cst.kernel_size*cst.input_slice*4+cols_x;\n"
|
||||
" auto xy_im=im+b*cst.input_size*cst.input_slice+z*cst.input_size;\n"
|
||||
" for (int ky=0,src_y=offset_y; ky<cst.kernel_y; ky++,src_y += cst.dilation_y) {\n"
|
||||
" for (int kx=0,src_x=offset_x; kx<cst.kernel_x; kx++,src_x += cst.dilation_x) {\n"
|
||||
" auto pad=src_x<0 || src_y<0 || src_x >= cst.input_width || src_y >= cst.input_height;\n"
|
||||
" xy_cols[(ky*cst.kernel_x+kx)*4]=pad ? 0 : xy_im[src_y*cst.input_width+src_x];\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"kernel void conv_col2im(const device M4 *cols [[buffer(0)]],\n"
|
||||
" device M4 *im [[buffer(1)]],\n"
|
||||
" const device M4 *biasTerms [[buffer(2)]],\n"
|
||||
" constant conv_im2col_cst& cst [[buffer(3)]],\n"
|
||||
" uint3 gid [[thread_position_in_grid]]) {\n"
|
||||
" auto z=gid.z % cst.output_slice;\n"
|
||||
" auto b=gid.z/cst.output_slice;\n"
|
||||
" if ((int)gid.x<cst.output_width && (int)gid.y<cst.output_height && (int)b<cst.batch) {\n"
|
||||
" int index=b*cst.output_size+gid.y*cst.output_width+gid.x;\n"
|
||||
" auto src_x=index/4;\n"
|
||||
" auto src_y=index % 4+z*4;\n"
|
||||
" auto src_y_stride=UP_DIV(cst.output_size*cst.batch,4);\n"
|
||||
" \n"
|
||||
" auto v=cols[(int)src_y*src_y_stride+(int)src_x]+biasTerms[(int)z];\n"
|
||||
" im[(int)gid.z*cst.output_size+(int)gid.y*cst.output_width+(int)gid.x]=activate(v,cst.activation);\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"struct matmul4x4_const {\n"
|
||||
" int output_width;\n"
|
||||
" int output_height;\n"
|
||||
|
@ -2428,8 +2349,6 @@ const char* shader_MetalDefine_metal =
|
|||
"// –––––––––––––––––––––––––––––––––––––––––––––––––––\n"
|
||||
"#define UP_DIV(x,y) ( ((x)+(y)-1)/(y) )\n"
|
||||
"#define ROUND_UP(x,y) ( ((x)+(y)-1)/(y)*(y) )\n"
|
||||
"// whether store with float32\n"
|
||||
"#define MNN_METAL_FULL_PRECISION 0 // should edit in .h too\n"
|
||||
"// whether computer with float32 when store with float16\n"
|
||||
"#define MNN_METAL_FLOAT32_COMPUTER 1 //\n"
|
||||
"#if MNN_METAL_FULL_PRECISION\n"
|
||||
|
|
|
@ -16,7 +16,6 @@ extern const char* shader_MetalScale_metal;
|
|||
extern const char* shader_MetalDeconvolution_metal;
|
||||
extern const char* shader_MetalPooling_metal;
|
||||
extern const char* shader_MetalROIPooling_metal;
|
||||
extern const char* shader_MetalCast_metal;
|
||||
extern const char* shader_MetalConvolution1x1_metal;
|
||||
extern const char* shader_MetalConvolutionGEMM_metal;
|
||||
extern const char* shader_MetalResize_metal;
|
||||
|
|
|
@ -1,4 +1,8 @@
|
|||
FILE(GLOB MNN_Metal_SRC ${CMAKE_CURRENT_LIST_DIR}/*.mm ${CMAKE_CURRENT_LIST_DIR}/*.hpp ${CMAKE_CURRENT_LIST_DIR}/*.h ${CMAKE_CURRENT_LIST_DIR}/*.cpp)
|
||||
IF(MNN_SUPPORT_RENDER)
|
||||
file(GLOB MNN_Metal_Render_SRC ${CMAKE_CURRENT_LIST_DIR}/render/*.mm ${CMAKE_CURRENT_LIST_DIR}/render/*.hpp ${CMAKE_CURRENT_LIST_DIR}/render/*.cpp)
|
||||
list(APPEND MNN_Metal_SRC ${MNN_Metal_Render_SRC})
|
||||
ENDIF()
|
||||
FILE(GLOB MNN_Metal_KERNELS_SRC ${CMAKE_CURRENT_LIST_DIR}/*.metal)
|
||||
option(MNN_METALLIB_SOURCE "Use Metal Source Directly" ON)
|
||||
add_library(MNNMetal OBJECT ${MNN_Metal_SRC} "${CMAKE_CURRENT_LIST_DIR}/MetalOPRegister.mm")
|
||||
|
|
|
@ -42,6 +42,7 @@ typedef struct {
|
|||
@property (strong, nonatomic, readonly) id<MTLDevice> device;
|
||||
/** max memory length cound be used in threadgroup */
|
||||
@property (assign, nonatomic, readonly) BOOL isCommitEachShader;
|
||||
@property (assign, nonatomic, readonly) BOOL isIphone;
|
||||
|
||||
/**
|
||||
* @brief alloc temp buffer on device
|
||||
|
@ -60,19 +61,6 @@ typedef struct {
|
|||
*/
|
||||
- (id<MTLBuffer>)newDeviceBuffer:(NSUInteger)size bytes:(const void *)bytes access:(MNN::MetalAccess)access;
|
||||
|
||||
/**
|
||||
* @brief create compute encoder on default command buffer
|
||||
* @return created encoder
|
||||
*/
|
||||
- (id<MTLComputeCommandEncoder>)encoder;
|
||||
- (id<MTLComputeCommandEncoder>)encoder_net;
|
||||
|
||||
/**
|
||||
* @brief create fill encoder on default command buffer
|
||||
* @return created encoder
|
||||
*/
|
||||
- (id<MTLBlitCommandEncoder>)encoderBlit;
|
||||
- (id<MTLBlitCommandEncoder>)encoderBlit_net;
|
||||
|
||||
/**
|
||||
* @brief load encoder with function name. returns maxTotalThreadsPerThreadgroup of pipeline.
|
||||
|
@ -80,7 +68,7 @@ typedef struct {
|
|||
* @param encoder command encoder
|
||||
* @return bandwidth info for function
|
||||
*/
|
||||
- (MNN::MetalBandwidth)load:(NSString *)name encoder:(id<MTLComputeCommandEncoder>)encoder;
|
||||
- (MNN::MetalBandwidth)load:(NSString *)name encoder:(id<MTLComputeCommandEncoder>)encoder fp16:(BOOL)fp16;
|
||||
|
||||
/**
|
||||
* @brief load encoder with function name. returns maxTotalThreadsPerThreadgroup of pipeline.
|
||||
|
@ -88,22 +76,15 @@ typedef struct {
|
|||
* @param encoder command encoder
|
||||
* @return bandwidth info for function
|
||||
*/
|
||||
- (id<MTLCommandBuffer>) newCmdBuffer:(MTLSize) localIndex;
|
||||
- (id<MTLCommandBuffer>) newCmdBuffer:(MTLSize) localIndex queue:(id<MTLCommandQueue>) cmdqueue;
|
||||
|
||||
- (NSUInteger)timeUsed:(id<MTLCommandBuffer>) buffer;
|
||||
|
||||
- (std::tuple<MTLSize, MTLSize, NSUInteger>) getGridAndThreadgroup: (id<MTLComputePipelineState>)pipeline gid:(MTLSize)threads loop:(NSUInteger)count buffer:(NSArray *)buffers runtime:(MNN::MetalRuntime *) rt shaderName:(std::string) kernelName;
|
||||
- (std::tuple<MTLSize, MTLSize, NSUInteger>) getGridAndThreadgroup: (id<MTLComputePipelineState>)pipeline gid:(MTLSize)threads loop:(NSUInteger)count buffer:(NSArray *)buffers runtime:(MNN::MetalRuntime *) rt shaderName:(std::string) kernelName queue:(id<MTLCommandQueue>) cmdqueue;
|
||||
- (NSUInteger)PipelinetimeUsed: (id<MTLComputePipelineState>)pipeline global:(MTLSize)globals local:(MTLSize)locals loop:(NSUInteger)count buffer:(NSArray *)buffers queue:(id<MTLCommandQueue>) cmdqueue;
|
||||
|
||||
|
||||
- (BOOL) initWithSharedContext:(const MNNMetalSharedContext*)context dev:(id<MTLDevice>)device;
|
||||
/**
|
||||
* @brief commit commands
|
||||
*/
|
||||
- (void)commit;
|
||||
- (void)commit_net;
|
||||
/**
|
||||
* @brief wait for completion
|
||||
*/
|
||||
- (void)wait;
|
||||
|
||||
/**
|
||||
* @brief dispatch encoder with default settings
|
||||
|
@ -126,8 +107,8 @@ typedef struct {
|
|||
threads:(MTLSize)threads
|
||||
threadsPerGroup:(MTLSize)threadsPerGroup
|
||||
bandwidth:(MNN::MetalBandwidth)bandwidth;
|
||||
- (id<MTLComputePipelineState>)pipelineWithName:(NSString *)name;
|
||||
- (id<MTLComputePipelineState>)pipelineWithSource:(NSString *)source name:(NSString *)name;
|
||||
- (id<MTLComputePipelineState>)pipelineWithName:(NSString *)name fp16:(BOOL)fp16;
|
||||
- (id<MTLComputePipelineState>)pipelineWithSourceOption:(NSString *)source name:(NSString *)name options:(MTLCompileOptions *)options;
|
||||
- (MTLSize)computeBestGroup:(id<MTLComputePipelineState>) pipeline threads:(MTLSize)threads;
|
||||
|
||||
- (std::pair<MTLSize, MTLSize>)computeBestGroupAndLocal:(id<MTLComputePipelineState>) bw threads:(MTLSize)t;
|
||||
|
|
|
@ -22,18 +22,15 @@ using namespace MNN;
|
|||
@interface MNNMetalContext ()
|
||||
// public
|
||||
@property (strong, nonatomic) id<MTLDevice> device;
|
||||
@property (strong, nonatomic) id<MTLCommandQueue> commandQueue;
|
||||
@property (strong, nonatomic) id<MTLCommandBuffer> commandBuffer;
|
||||
@property (strong, nonatomic) id<MTLCommandBuffer> commandBuffer_net;
|
||||
@property (assign, nonatomic) BOOL isIphone;
|
||||
// private
|
||||
@property (strong, nonatomic) NSMutableDictionary<NSString *, id<MTLComputePipelineState>> *caches;
|
||||
@property (strong, nonatomic) NSMutableArray<id<MTLCommandBuffer>> *waitings;
|
||||
@property (strong, nonatomic) NSMutableDictionary<NSString *, id<MTLLibrary>>* library;
|
||||
@property (strong, nonatomic) NSMutableDictionary<NSString *, id<MTLComputePipelineState>> *cachesFp32;
|
||||
@property (strong, nonatomic) NSMutableDictionary<NSString *, id<MTLComputePipelineState>> *cachesFp16;
|
||||
@end
|
||||
|
||||
@implementation MNNMetalContext
|
||||
|
||||
static void createLibrary(id<MTLDevice> device, NSMutableDictionary<NSString *, id<MTLLibrary>>* libraryMap) {
|
||||
static void createLibrary(id<MTLDevice> device, NSMutableDictionary<NSString *, id<MTLComputePipelineState>>* libraryMap, bool usefp16) {
|
||||
AUTOTIME;
|
||||
ShaderMap shader;
|
||||
auto first = shader.search("shader_MetalDefine_metal");
|
||||
|
@ -47,6 +44,11 @@ static void createLibrary(id<MTLDevice> device, NSMutableDictionary<NSString *,
|
|||
if (iter.first == "shader_MetalConvolutionActivation_metal") {
|
||||
continue;
|
||||
}
|
||||
if (!usefp16) {
|
||||
total << "#define MNN_METAL_FULL_PRECISION 1\n";
|
||||
} else {
|
||||
total << "#define MNN_METAL_FULL_PRECISION 0\n";
|
||||
}
|
||||
total << first << "\n" << second << "\n" << iter.second;
|
||||
auto totalString = total.str();
|
||||
auto totalNSString = [[NSString alloc] initWithUTF8String:totalString.c_str()];
|
||||
|
@ -64,7 +66,15 @@ static void createLibrary(id<MTLDevice> device, NSMutableDictionary<NSString *,
|
|||
}
|
||||
auto functionNames = [library functionNames];
|
||||
for(int i=0; i<functionNames.count ; i++) {
|
||||
libraryMap[functionNames[i]] = library;
|
||||
id<MTLFunction> function = [library newFunctionWithName:functionNames[i]];
|
||||
if (!function) {
|
||||
MNN_ERROR("Create Function in metal error\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
NSError *error = nil;
|
||||
auto result = [device newComputePipelineStateWithFunction:function error:&error];
|
||||
libraryMap[functionNames[i]] = result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -96,19 +106,29 @@ static void createLibrary(id<MTLDevice> device, NSMutableDictionary<NSString *,
|
|||
return NO;
|
||||
}
|
||||
|
||||
+ (BOOL)isIphone{
|
||||
struct utsname systemInfo;
|
||||
uname(&systemInfo);
|
||||
NSString *deviceString = [NSString stringWithCString:systemInfo.machine encoding:NSASCIIStringEncoding];
|
||||
NSString *subString = @"iPhone";
|
||||
NSRange range = [deviceString rangeOfString:subString];
|
||||
if (range.location != NSNotFound) {
|
||||
return YES;
|
||||
}
|
||||
return NO;
|
||||
}
|
||||
|
||||
|
||||
- (BOOL) initWithSharedContext:(const MNNMetalSharedContext*)context dev:(id<MTLDevice>)device {
|
||||
MNN_ASSERT(nullptr != context);
|
||||
_device = context->device;
|
||||
_library = [NSMutableDictionary dictionary];
|
||||
createLibrary(_device, _library);
|
||||
_commandQueue = context->queue;
|
||||
_commandBuffer = [_commandQueue commandBuffer];
|
||||
_commandBuffer_net = [_commandQueue commandBuffer];
|
||||
_caches = [NSMutableDictionary dictionary];
|
||||
_waitings = [NSMutableArray array];
|
||||
_cachesFp16 = [NSMutableDictionary dictionary];
|
||||
_cachesFp32 = [NSMutableDictionary dictionary];
|
||||
_isCommitEachShader = self.class.commit_frequent;
|
||||
|
||||
return (0 != [_library count]);
|
||||
_isIphone = self.class.isIphone;
|
||||
createLibrary(_device, _cachesFp16, true);
|
||||
createLibrary(_device, _cachesFp32, false);
|
||||
return nil != _device;
|
||||
}
|
||||
|
||||
- (instancetype)init {
|
||||
|
@ -139,42 +159,16 @@ static void createLibrary(id<MTLDevice> device, NSMutableDictionary<NSString *,
|
|||
return [_device newBufferWithBytes:bytes length:size options:[self optionForAccess:access]];
|
||||
}
|
||||
|
||||
#pragma mark enqueue
|
||||
- (id<MTLFunction>)functionWithName:(NSString *)name {
|
||||
if (!name)
|
||||
return nil;
|
||||
auto lib = _library[name];
|
||||
id<MTLFunction> result = [lib newFunctionWithName:name];
|
||||
#if MNN_METAL_DEBUG || MNN_METAL_BENCHMARK
|
||||
if (@available(iOS 10.0, *))
|
||||
result.label = name;
|
||||
#endif
|
||||
return result;
|
||||
- (id<MTLComputePipelineState>)pipelineWithName:(NSString *)name fp16:(BOOL)fp16 {
|
||||
if (fp16) {
|
||||
return _cachesFp16[name];
|
||||
}
|
||||
return _cachesFp32[name];
|
||||
}
|
||||
|
||||
- (id<MTLComputePipelineState>)pipelineWithName:(NSString *)name {
|
||||
id<MTLComputePipelineState> result = _caches[name];
|
||||
if (result)
|
||||
return result;
|
||||
|
||||
id<MTLFunction> function = [self functionWithName:name];
|
||||
if (!function)
|
||||
return nil;
|
||||
|
||||
NSError *error = nil;
|
||||
result = [_device newComputePipelineStateWithFunction:function error:&error];
|
||||
#if MNN_METAL_DEBUG
|
||||
if (error)
|
||||
printf("[METAL] create pipeline error: %s\n", error.localizedDescription.UTF8String);
|
||||
#endif
|
||||
if (result)
|
||||
_caches[name] = result;
|
||||
return result;
|
||||
}
|
||||
|
||||
- (id<MTLComputePipelineState>)pipelineWithSource:(NSString *)source name:(NSString *)name {
|
||||
- (id<MTLComputePipelineState>)pipelineWithSourceOption:(NSString *)source name:(NSString *)name options:(MTLCompileOptions *)options {
|
||||
NSError *err = nil;
|
||||
auto library = [_device newLibraryWithSource:source options:nil error:&err];
|
||||
auto library = [_device newLibraryWithSource:source options:options error:&err];
|
||||
if (nil == library) {
|
||||
if (err) {
|
||||
NSLog(@"Warning: pipelineWithSource error: %@", err);
|
||||
|
@ -184,43 +178,11 @@ static void createLibrary(id<MTLDevice> device, NSMutableDictionary<NSString *,
|
|||
id<MTLFunction> function = [library newFunctionWithName:name];
|
||||
NSError *error = nil;
|
||||
id<MTLComputePipelineState> result = [_device newComputePipelineStateWithFunction:function error:&error];
|
||||
if (result)
|
||||
_caches[name] = result;
|
||||
return result;
|
||||
}
|
||||
|
||||
- (id<MTLComputeCommandEncoder>)encoder {
|
||||
id<MTLComputeCommandEncoder> result = [_commandBuffer computeCommandEncoder];
|
||||
#if MNN_METAL_DEBUG || MNN_METAL_BENCHMARK
|
||||
result.label = nil;
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
- (id<MTLBlitCommandEncoder>)encoderBlit {
|
||||
id<MTLBlitCommandEncoder> result = [_commandBuffer blitCommandEncoder];
|
||||
#if MNN_METAL_DEBUG || MNN_METAL_BENCHMARK
|
||||
result.label = nil;
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
- (id<MTLComputeCommandEncoder>)encoder_net {
|
||||
id<MTLComputeCommandEncoder> result = [_commandBuffer_net computeCommandEncoder];
|
||||
#if MNN_METAL_DEBUG || MNN_METAL_BENCHMARK
|
||||
result.label = nil;
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
- (id<MTLBlitCommandEncoder>)encoderBlit_net {
|
||||
id<MTLBlitCommandEncoder> result = [_commandBuffer_net blitCommandEncoder];
|
||||
#if MNN_METAL_DEBUG || MNN_METAL_BENCHMARK
|
||||
result.label = nil;
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
- (MetalBandwidth)load:(NSString *)name encoder:(id<MTLComputeCommandEncoder>)encoder {
|
||||
id<MTLComputePipelineState> pipeline = [self pipelineWithName:name];
|
||||
- (MetalBandwidth)load:(NSString *)name encoder:(id<MTLComputeCommandEncoder>)encoder fp16:(BOOL)fp16 {
|
||||
id<MTLComputePipelineState> pipeline = [self pipelineWithName:name fp16:fp16];
|
||||
MNN_ASSERT(nil != pipeline);
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
#if MNN_METAL_DEBUG || MNN_METAL_BENCHMARK
|
||||
|
@ -238,13 +200,6 @@ static void createLibrary(id<MTLDevice> device, NSMutableDictionary<NSString *,
|
|||
return {pipeline.threadExecutionWidth, pipeline.maxTotalThreadsPerThreadgroup, NO};
|
||||
}
|
||||
|
||||
- (id<MTLCommandBuffer>) newCmdBuffer:(MTLSize) localIndex {
|
||||
id<MTLCommandBuffer> cmdBuffer = [_commandQueue commandBuffer]; // create a new command buffer
|
||||
std::string label = std::to_string((int)localIndex.width) + "_" + std::to_string((int)localIndex.height) + "_" + std::to_string((int)localIndex.depth);
|
||||
cmdBuffer.label = [NSString stringWithCString:label.c_str() encoding:[NSString defaultCStringEncoding]];
|
||||
return cmdBuffer;
|
||||
}
|
||||
|
||||
- (NSUInteger)timeUsed:(id<MTLCommandBuffer>)buffer {
|
||||
// Get ns precision time
|
||||
auto start = mach_absolute_time();
|
||||
|
@ -256,8 +211,14 @@ static void createLibrary(id<MTLDevice> device, NSMutableDictionary<NSString *,
|
|||
return (end-start)/1000;
|
||||
}
|
||||
|
||||
- (id<MTLCommandBuffer>) newCmdBuffer:(MTLSize) localIndex queue:(id<MTLCommandQueue>) cmdqueue {
|
||||
id<MTLCommandBuffer> cmdBuffer = [cmdqueue commandBuffer]; // create a new command buffer
|
||||
std::string label = std::to_string((int)localIndex.width) + "_" + std::to_string((int)localIndex.height) + "_" + std::to_string((int)localIndex.depth);
|
||||
cmdBuffer.label = [NSString stringWithCString:label.c_str() encoding:[NSString defaultCStringEncoding]];
|
||||
return cmdBuffer;
|
||||
}
|
||||
|
||||
- (std::tuple<MTLSize, MTLSize, NSUInteger>) getGridAndThreadgroup: (id<MTLComputePipelineState>)pipeline gid:(MTLSize)threads loop:(NSUInteger)count buffer:(NSArray *)buffers runtime:(MetalRuntime *) rt shaderName:(std::string) kernelName {
|
||||
- (std::tuple<MTLSize, MTLSize, NSUInteger>) getGridAndThreadgroup: (id<MTLComputePipelineState>)pipeline gid:(MTLSize)threads loop:(NSUInteger)count buffer:(NSArray *)buffers runtime:(MetalRuntime *) rt shaderName:(std::string) kernelName queue:(id<MTLCommandQueue>) cmdqueue {
|
||||
NSUInteger gid_x = threads.width;
|
||||
NSUInteger gid_y = threads.height;
|
||||
NSUInteger gid_z = threads.depth;
|
||||
|
@ -289,7 +250,7 @@ static void createLibrary(id<MTLDevice> device, NSMutableDictionary<NSString *,
|
|||
{
|
||||
//get original trick time
|
||||
{
|
||||
id<MTLCommandBuffer> commamd_buffer = [self newCmdBuffer:thread.second];
|
||||
id<MTLCommandBuffer> commamd_buffer = [self newCmdBuffer:thread.second queue:cmdqueue];
|
||||
id<MTLComputeCommandEncoder> encoder = [commamd_buffer computeCommandEncoder];
|
||||
|
||||
int loop = count;
|
||||
|
@ -344,7 +305,7 @@ static void createLibrary(id<MTLDevice> device, NSMutableDictionary<NSString *,
|
|||
}
|
||||
MTLSize local = {x, y, z};
|
||||
MTLSize global = {UP_DIV(gid_x, x), UP_DIV(gid_y, y), UP_DIV(gid_z, z)};
|
||||
id<MTLCommandBuffer> commamd_buffer = [self newCmdBuffer:local];
|
||||
id<MTLCommandBuffer> commamd_buffer = [self newCmdBuffer:local queue:cmdqueue];
|
||||
id<MTLComputeCommandEncoder> encoder = [commamd_buffer computeCommandEncoder];
|
||||
|
||||
int loop = count;
|
||||
|
@ -388,50 +349,27 @@ static void createLibrary(id<MTLDevice> device, NSMutableDictionary<NSString *,
|
|||
return std::make_tuple(thread.first, thread.second, min_time);
|
||||
}
|
||||
|
||||
#pragma mark dispatch
|
||||
- (void)commit {
|
||||
if (_commandBuffer.status < MTLCommandBufferStatusCommitted) {
|
||||
[_commandBuffer commit];
|
||||
[_waitings addObject:_commandBuffer];
|
||||
_commandBuffer = [_commandQueue commandBuffer]; // create a new command buffer
|
||||
}
|
||||
|
||||
- (NSUInteger)PipelinetimeUsed: (id<MTLComputePipelineState>)pipeline global:(MTLSize)globals local:(MTLSize)locals loop:(NSUInteger)count buffer:(NSArray *)buffers queue:(id<MTLCommandQueue>) cmdqueue{
|
||||
NSUInteger time = 0;
|
||||
MTLSize local_size = {locals.width, locals.height, locals.depth};
|
||||
MTLSize global_size = {globals.width, globals.height, globals.depth};
|
||||
id<MTLCommandBuffer> commamd_buffer = [self newCmdBuffer:local_size queue:cmdqueue];
|
||||
id<MTLComputeCommandEncoder> encoder = [commamd_buffer computeCommandEncoder];
|
||||
|
||||
int loop = count;
|
||||
while(loop--) {
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
for(NSUInteger idx = 0; idx < buffers.count; idx++) {
|
||||
[encoder setBuffer:[buffers objectAtIndex:idx] offset:0 atIndex:idx];
|
||||
}
|
||||
|
||||
- (void)commit_net {
|
||||
if (_commandBuffer_net.status < MTLCommandBufferStatusCommitted) {
|
||||
[_commandBuffer_net commit];
|
||||
[_waitings addObject:_commandBuffer_net];
|
||||
_commandBuffer_net = [_commandQueue commandBuffer]; // create a new command buffer
|
||||
}
|
||||
[encoder dispatchThreadgroups:global_size threadsPerThreadgroup:local_size];
|
||||
}
|
||||
[encoder endEncoding];
|
||||
time = [self timeUsed :commamd_buffer];
|
||||
|
||||
- (void)wait {
|
||||
for (id<MTLCommandBuffer> buffer in _waitings) {
|
||||
if (buffer.status >= MTLCommandBufferStatusCompleted)
|
||||
continue;
|
||||
|
||||
#if MNN_METAL_BENCHMARK
|
||||
NSTimeInterval begin = [NSDate timeIntervalSinceReferenceDate];
|
||||
[buffer waitUntilCompleted];
|
||||
NSTimeInterval end = [NSDate timeIntervalSinceReferenceDate];
|
||||
if (@available(iOS 10.3, *)) {
|
||||
printf("[METAL] commit costs: %.3fms\t(kernel: %.3fms, GPU: %.3fms)\n", (end - begin) * 1000.f,
|
||||
(buffer.kernelEndTime - buffer.kernelStartTime) * 1000.f,
|
||||
(buffer.GPUEndTime - buffer.GPUStartTime) * 1000.f);
|
||||
} else {
|
||||
printf("[METAL] commit costs: %.3fms\n", (end - begin) * 1000.f);
|
||||
}
|
||||
#else
|
||||
[buffer waitUntilCompleted];
|
||||
#endif
|
||||
|
||||
#if MNN_METAL_DEBUG
|
||||
if (buffer.error) {
|
||||
printf("[METAL] %s\n", buffer.error.localizedDescription.UTF8String);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
[_waitings removeAllObjects];
|
||||
return time;
|
||||
}
|
||||
|
||||
static NSUInteger smallest_log2(NSUInteger integer) {
|
||||
|
@ -663,7 +601,7 @@ void printBuffer(const void *content, unsigned long bytes, const char *fmt) {
|
|||
}
|
||||
} else if (type == halide_type_float) {
|
||||
if (bits == 16) { // half
|
||||
printBuffer<metal_float>(bytes, length, "%.4f");
|
||||
printBuffer<__fp16>(bytes, length, "%.4f");
|
||||
} else { // float
|
||||
printBuffer<float>(bytes, length, "%.4f");
|
||||
}
|
||||
|
|
|
@ -37,6 +37,10 @@ public:
|
|||
}
|
||||
|
||||
void setGpuMode(const int cl_mode_num);
|
||||
void setCommandQueue(id<MTLCommandQueue> queue);
|
||||
id<MTLCommandQueue> getCommandQueue() const {
|
||||
return mQueue;
|
||||
}
|
||||
|
||||
std::pair<const void*, size_t> makeCache(TunedInfo* info);
|
||||
bool setCache(std::pair<const void*, size_t> cache);
|
||||
|
@ -70,10 +74,12 @@ private:
|
|||
std::map<std::pair<std::string, std::vector<uint32_t>>, std::tuple<std::vector<uint32_t>, std::vector<uint32_t>, uint32_t>> mTunedThreadGroup;
|
||||
|
||||
private:
|
||||
id<MTLCommandQueue> mQueue = nil;
|
||||
std::vector<uint8_t> mBuffer;
|
||||
const void* mCacheOutside = nullptr;
|
||||
size_t mCacheOutsideSize = 0;
|
||||
TunedInfo* mTunedInfo;
|
||||
BackendConfig mDefaultConfig;
|
||||
};
|
||||
|
||||
|
||||
|
@ -124,11 +130,13 @@ public:
|
|||
* @param creator registering creator.
|
||||
*/
|
||||
static void addCreator(OpType type, Creator *creator);
|
||||
size_t getTensorSizeInBytes(const Tensor* tensor) const;
|
||||
|
||||
id<MTLBuffer> getHostBuffer(size_t size) const;
|
||||
id<MTLBuffer> getConstBuffer(size_t size) const;
|
||||
id<MTLComputePipelineState> makeComputePipelineWithSourceOption(const char* csource, const char* cname, MTLCompileOptions *options) const;
|
||||
public:
|
||||
MetalBackend(std::shared_ptr<EagerBufferAllocator> staticMem, const MetalRuntime* runtime);
|
||||
MetalBackend(std::shared_ptr<EagerBufferAllocator> staticMem, const MetalRuntime* runtime, bool usefp16AsFp32);
|
||||
virtual ~MetalBackend();
|
||||
const MetalRuntime* runtime() const {
|
||||
return mRuntime;
|
||||
|
@ -146,6 +154,7 @@ public:
|
|||
virtual void onExecuteBegin() const override;
|
||||
virtual void onExecuteEnd() const override;
|
||||
virtual int onSync(Tensor::MapType mtype, bool toCpu, const Tensor* dstTensor) override;
|
||||
virtual bool onGetTensorInfo(const Tensor* tensor, void* dstInfo) override;
|
||||
|
||||
public:
|
||||
/**
|
||||
|
@ -164,7 +173,7 @@ public:
|
|||
id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> shape) const;
|
||||
|
||||
void flushEncoder() const;
|
||||
id<MTLComputeCommandEncoder> encoder() const;
|
||||
id<MTLComputeCommandEncoder> encoder_for_net() const;
|
||||
void addOpEncoder(std::function<void(void)> opEncoder);
|
||||
|
||||
bool isCommandEncoderSet();
|
||||
|
@ -178,15 +187,36 @@ public:
|
|||
}
|
||||
|
||||
bool isCmdBufferCommit();
|
||||
bool isIphone(){
|
||||
return mIsIphone;
|
||||
}
|
||||
|
||||
void commit() const;
|
||||
void commit_net() const;
|
||||
void wait() const;
|
||||
id<MTLCommandQueue> queue() const {
|
||||
return _commandQueue;
|
||||
}
|
||||
bool useFp16InsteadFp32() const {
|
||||
return mUseFloatAsFp16;
|
||||
}
|
||||
private:
|
||||
id<MTLCommandBuffer> getCommandBufferForBufferCopy() const;
|
||||
id<MTLCommandBuffer> getCommandBufferForNet() const;
|
||||
id<MTLComputeCommandEncoder> encoder_net() const;
|
||||
mutable id<MTLCommandBuffer> _commandBuffer = nil;
|
||||
mutable id<MTLCommandBuffer> _commandBuffer_net = nil;
|
||||
mutable id<MTLCommandBuffer> _waiting = nil;
|
||||
|
||||
id<MTLCommandQueue> _commandQueue;
|
||||
|
||||
const MetalRuntime* mRuntime;
|
||||
std::vector<id<MTLBuffer>> mHoldBuffers;
|
||||
id<MTLBuffer> mShapeH2D;
|
||||
id<MTLBuffer> mShapeD2H;
|
||||
mutable NSUInteger mEncoderCount = 0;
|
||||
mutable bool mOpEncoderSet = false;//whether has set encoder
|
||||
mutable bool mOpFullSupport = true;
|
||||
mutable bool mSupportDeferEncode = true;
|
||||
mutable bool mFrameEncodeCache = false;
|
||||
|
||||
std::vector<std::function<void(void)>> mOpEncoders;
|
||||
|
@ -199,6 +229,8 @@ private:
|
|||
void onCopyHostToDevice(const Tensor *src, const Tensor *dst) const;
|
||||
void onCopyDeviceToHost(const Tensor *src, const Tensor *dst) const;
|
||||
void onCopyDeviceToDevice(const Tensor *src, const Tensor *dst, id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> shape) const;
|
||||
bool mUseFloatAsFp16;
|
||||
bool mIsIphone = false;
|
||||
};
|
||||
|
||||
|
||||
|
|
|
@ -34,6 +34,9 @@ struct TunedInfo {
|
|||
};
|
||||
|
||||
void registerMetalOps();
|
||||
#ifdef MNN_SUPPORT_RENDER
|
||||
extern void registerMetalRenderOps();
|
||||
#endif
|
||||
|
||||
static inline std::map<OpType, MetalBackend::Creator *> *getCreatorMap() {
|
||||
static std::once_flag of;
|
||||
|
@ -50,17 +53,40 @@ void MetalBackend::addCreator(OpType t, Creator *c) {
|
|||
map->insert(std::make_pair(t, c));
|
||||
}
|
||||
|
||||
MetalBackend::MetalBackend(std::shared_ptr<EagerBufferAllocator> staticMem, const MetalRuntime* runtime) : Backend(MNN_FORWARD_METAL) {
|
||||
MetalBackend::MetalBackend(std::shared_ptr<EagerBufferAllocator> staticMem, const MetalRuntime* runtime, bool usefp16AsFp32) : Backend(MNN_FORWARD_METAL) {
|
||||
mRuntime = runtime;
|
||||
mBufferPool.reset(new EagerBufferAllocator(EagerBufferAllocator::Allocator::createRecurse(staticMem.get()), 1024));
|
||||
mStaticBufferPool = staticMem;
|
||||
mShapeH2D = getConstBuffer(4 * sizeof(int));
|
||||
mShapeD2H = getConstBuffer(4 * sizeof(int));
|
||||
mOpFullSupport = true;
|
||||
mUseFloatAsFp16 = usefp16AsFp32;
|
||||
auto ctx = (__bridge MNNMetalContext *)context();
|
||||
mIsIphone = ctx.isIphone;
|
||||
if (runtime->getCommandQueue() == nil) {
|
||||
// one command queue can create only a few command buffer, so let each backend own a command queue
|
||||
_commandQueue = [[ctx device] newCommandQueue];
|
||||
mSupportDeferEncode = true;
|
||||
} else {
|
||||
// otherwise forbid defer encode optimize
|
||||
_commandQueue = runtime->getCommandQueue();
|
||||
mSupportDeferEncode = false;
|
||||
}
|
||||
_commandBuffer = nil;
|
||||
_commandBuffer_net = nil;
|
||||
_waiting = nil;
|
||||
}
|
||||
MetalBackend::~MetalBackend() {
|
||||
// Do nothing
|
||||
flushEncoder();
|
||||
}
|
||||
|
||||
id<MTLComputeCommandEncoder> MetalBackend::encoder_net() const {
|
||||
id<MTLComputeCommandEncoder> result = [getCommandBufferForNet() computeCommandEncoder];
|
||||
#if MNN_METAL_DEBUG || MNN_METAL_BENCHMARK
|
||||
result.label = nil;
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
void *MetalBackend::context() const {
|
||||
return mRuntime->context();
|
||||
}
|
||||
|
@ -81,8 +107,7 @@ private:
|
|||
MemChunk mBuffer;
|
||||
EagerBufferAllocator* mAllocator;
|
||||
};
|
||||
Backend::MemObj* MetalBackend::onAcquire(const Tensor *_tensor, StorageType storageType) {
|
||||
auto tensor = const_cast<Tensor *>(_tensor);
|
||||
size_t MetalBackend::getTensorSizeInBytes(const Tensor* tensor) const {
|
||||
auto format = TensorUtils::getDescribe(tensor)->dimensionFormat;
|
||||
size_t size;
|
||||
if (MNN_DATA_FORMAT_NC4HW4 == format && tensor->dimensions() >= 2) {
|
||||
|
@ -107,16 +132,25 @@ Backend::MemObj* MetalBackend::onAcquire(const Tensor *_tensor, StorageType stor
|
|||
size = ROUND_UP(size, 4);
|
||||
}
|
||||
if (0 == size) {
|
||||
return nullptr;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// use metal_float when meets float
|
||||
if (halide_type_float == tensor->buffer().type.code && tensor->buffer().type.bits == 32) {
|
||||
size*= sizeof(metal_float);
|
||||
if (halide_type_float == tensor->buffer().type.code && tensor->buffer().type.bits == 32 && mUseFloatAsFp16) {
|
||||
size *= 2;
|
||||
} else {
|
||||
size *= tensor->getType().bytes();
|
||||
}
|
||||
size_t align = 4 * sizeof(int);
|
||||
size = ROUND_UP(size, align);
|
||||
return size;
|
||||
}
|
||||
|
||||
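Note: the allocation-size logic formerly inlined in onAcquire is factored into getTensorSizeInBytes: NC4HW4 tensors keep their channel rounding, fp32 data is stored as half when mUseFloatAsFp16 is set, and the byte count is rounded up to 16-byte alignment. A compact sketch of the final steps, assuming elementCount already reflects the layout rounding done earlier in the function:

#include <cstddef>
size_t tensorBytes(size_t elementCount, bool isFp32, bool useFloatAsFp16, size_t typeBytes) {
    if (elementCount == 0) {
        return 0;
    }
    size_t bytes = (isFp32 && useFloatAsFp16) ? elementCount * 2      // stored as half
                                              : elementCount * typeBytes;
    const size_t align = 4 * sizeof(int);                             // 16-byte alignment
    return (bytes + align - 1) / align * align;                       // ROUND_UP
}
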
Backend::MemObj* MetalBackend::onAcquire(const Tensor *_tensor, StorageType storageType) {
|
||||
auto tensor = const_cast<Tensor *>(_tensor);
|
||||
size_t size = getTensorSizeInBytes(_tensor);
|
||||
if (0 == size) {
|
||||
return nullptr;
|
||||
}
|
||||
// reuse if possible
|
||||
MemChunk buffer;
|
||||
EagerBufferAllocator* allocator = nullptr;
|
||||
|
@ -159,7 +193,7 @@ Execution *MetalBackend::onCreate(const std::vector<Tensor *> &inputs, const std
|
|||
auto map = getCreatorMap();
|
||||
auto iter = map->find(op->type());
|
||||
if (iter == map->end()) {
|
||||
mOpFullSupport = false;
|
||||
mSupportDeferEncode = false;
|
||||
if (nullptr != op->name()) {
|
||||
MNN_PRINT("Don't support type [%s], %s\n", EnumNameOpType(op->type()), op->name()->c_str());
|
||||
} else {
|
||||
|
@ -170,7 +204,7 @@ Execution *MetalBackend::onCreate(const std::vector<Tensor *> &inputs, const std
|
|||
|
||||
auto exe = iter->second->onCreate(inputs, op, this, outputs);
|
||||
if (NULL == exe) {
|
||||
mOpFullSupport = false;
|
||||
mSupportDeferEncode = false;
|
||||
MNN_PRINT("The Creator Don't support type [%s], %s\n", MNN::EnumNameOpType(op->type()), op->name() ? op->name()->c_str() : "");
|
||||
return NULL;
|
||||
}
|
||||
|
@ -192,8 +226,7 @@ void MetalBackend::onExecuteBegin() const {
|
|||
}
|
||||
void MetalBackend::onExecuteEnd() const {
|
||||
flushEncoder();
|
||||
auto ctx = (__bridge MNNMetalContext *)context();
|
||||
[ctx commit_net];
|
||||
commit_net();
|
||||
|
||||
if(mFrameEncodeCache) {
|
||||
for(auto opEncoder : mOpEncoders) {
|
||||
|
@ -202,6 +235,20 @@ void MetalBackend::onExecuteEnd() const {
|
|||
setOpEncoder();
|
||||
}
|
||||
}
|
||||
bool MetalBackend::onGetTensorInfo(const Tensor* tensor, void* dstInfo) {
|
||||
if (nullptr == dstInfo) {
|
||||
return true;
|
||||
}
|
||||
auto dst = (MNNMetalTensorContent*)dstInfo;
|
||||
dst->type.code = halide_type_float;
|
||||
if (mUseFloatAsFp16) {
|
||||
dst->type.bits = 16;
|
||||
} else {
|
||||
dst->type.bits = 32;
|
||||
}
|
||||
MNNMetalGetTensorContent(dst, (void*)tensor);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool MetalBackend::isCommandEncoderSet() {
|
||||
return mOpEncoderSet;// !isCommitEachShader & mOpFullSupport
|
||||
|
@ -350,14 +397,13 @@ void MetalBackend::onResizeBegin() {
|
|||
|
||||
// Finish last inference task if needed
|
||||
flushEncoder();
|
||||
auto ctx = (__bridge MNNMetalContext *)context();
|
||||
[ctx commit_net];
|
||||
[ctx wait];
|
||||
commit_net();
|
||||
wait();
|
||||
}
|
||||
|
||||
ErrorCode MetalBackend::onResizeEnd() {
|
||||
auto ctx = (__bridge MNNMetalContext *)context();
|
||||
mFrameEncodeCache = (!ctx.isCommitEachShader && mOpFullSupport);
|
||||
mFrameEncodeCache = (!ctx.isCommitEachShader && mSupportDeferEncode);
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
|
@ -368,13 +414,17 @@ void MetalBackend::onCopyHostToDevice(const Tensor *src, const Tensor *dst) cons
|
|||
auto device = (id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *) (dst->deviceId()))->getBuffer();
|
||||
auto floats = src->getType().code == halide_type_float;
|
||||
|
||||
// For command queue from user, need user to make sure last frame's gpu work is ready
|
||||
bool needWait = mRuntime->getCommandQueue() == nil;
|
||||
// cast
|
||||
if (sfmt == dfmt || src->dimensions() <= 1) {
|
||||
if (floats) {
|
||||
if (floats && mUseFloatAsFp16) {
|
||||
NSUInteger size = src->elementSize();
|
||||
auto sizeC4 = UP_DIV(size, 4);
|
||||
auto host = this->getHostBuffer(sizeC4 * 4 * sizeof(float));
|
||||
[ctx wait];// make sure previous gpu task finished. for reuse mHostBuffer and mShapeH2D
|
||||
if (needWait) {
|
||||
wait();
|
||||
}
|
||||
memcpy(host.contents, src->host<float>(), src->size());
|
||||
unsigned int limits[] = {
|
||||
(unsigned int)sizeC4,
|
||||
|
@ -383,8 +433,8 @@ void MetalBackend::onCopyHostToDevice(const Tensor *src, const Tensor *dst) cons
|
|||
1
|
||||
};
|
||||
::memcpy(mShapeH2D.contents, limits, sizeof(limits));
|
||||
auto encoder = [ctx encoder];
|
||||
auto bandwidth = [ctx load: @"downcast_float4" encoder:encoder];
|
||||
auto encoder = [getCommandBufferForBufferCopy() computeCommandEncoder];
|
||||
auto bandwidth = [ctx load: @"downcast_float4" encoder:encoder fp16:mUseFloatAsFp16];
|
||||
|
||||
[encoder setBuffer:host offset:0 atIndex:0];
|
||||
[encoder setBuffer:device offset:TensorUtils::getDescribe(dst)->extra.offset atIndex:1];
|
||||
|
@ -397,12 +447,14 @@ void MetalBackend::onCopyHostToDevice(const Tensor *src, const Tensor *dst) cons
|
|||
threads.first.width = UP_DIV(threads.first.width, threads.second.width);
|
||||
[encoder dispatchThreadgroups:threads.first threadsPerThreadgroup:threads.second];
|
||||
[encoder endEncoding];
|
||||
[ctx commit];
|
||||
commit();
|
||||
//[ctx wait];
|
||||
} else {
|
||||
[ctx wait];
|
||||
memcpy(device.contents, src->host<uint8_t>(), src->size());
|
||||
[ctx commit];
|
||||
if (needWait) {
|
||||
wait();
|
||||
}
|
||||
memcpy((uint8_t*)device.contents + TensorUtils::getDescribe(dst)->extra.offset, src->host<uint8_t>(), src->size());
|
||||
commit();
|
||||
//[ctx wait];
|
||||
}
|
||||
}
|
||||
|
@ -410,21 +462,23 @@ void MetalBackend::onCopyHostToDevice(const Tensor *src, const Tensor *dst) cons
|
|||
else {
|
||||
|
||||
auto buffer = getHostBuffer(src->elementSize() * sizeof(float));
|
||||
[ctx wait];// make sure previous gpu task finished. for reuse mHostBuffer and mShapeH2D
|
||||
if (needWait) {
|
||||
wait();
|
||||
}
|
||||
auto size = getTensorShape(mShapeH2D, src);
|
||||
memcpy(buffer.contents, src->host<float>(), src->size());
|
||||
auto encoder = [ctx encoder];
|
||||
auto encoder = [getCommandBufferForBufferCopy() computeCommandEncoder];
|
||||
auto kernel = kernelForConvert(src->getType(), sfmt, dfmt, Down);
|
||||
MNN_ASSERT(kernel != nil); // unsupported sfmt to dfmt
|
||||
|
||||
auto bandwidth = [ctx load:kernel encoder:encoder];
|
||||
auto bandwidth = [ctx load:kernel encoder:encoder fp16:mUseFloatAsFp16];
|
||||
|
||||
[encoder setBuffer:buffer offset:0 atIndex:0];
|
||||
[encoder setBuffer:device offset:TensorUtils::getDescribe(dst)->extra.offset atIndex:1];
|
||||
[encoder setBuffer:mShapeH2D offset:0 atIndex:2];
|
||||
[ctx dispatchEncoder:encoder threads:size bandwidth:bandwidth];
|
||||
[encoder endEncoding];
|
||||
[ctx commit];
|
||||
commit();
|
||||
//[ctx wait];
|
||||
}
|
||||
}
|
||||
|
@ -437,14 +491,14 @@ void MetalBackend::onCopyDeviceToHost(const Tensor *src, const Tensor *dst) cons
|
|||
auto floats = src->getType().code == halide_type_float;
|
||||
// cast
|
||||
if (sfmt == dfmt || src->dimensions() <= 1) {
|
||||
if (floats) {
|
||||
if (floats && mUseFloatAsFp16) {
|
||||
auto eleSize = dst->elementSize();
|
||||
eleSize = UP_DIV(eleSize, 4) * 4;
|
||||
auto buffer = getHostBuffer(eleSize * dst->getType().bytes());
|
||||
|
||||
NSUInteger size = src->elementSize();
|
||||
auto encoder = [ctx encoder];
|
||||
auto bandwidth = [ctx load: @"upcast_float4" encoder:encoder];
|
||||
auto encoder = [getCommandBufferForBufferCopy() computeCommandEncoder];
|
||||
auto bandwidth = [ctx load: @"upcast_float4" encoder:encoder fp16:mUseFloatAsFp16];
|
||||
[encoder setBuffer:device offset:TensorUtils::getDescribe(src)->extra.offset atIndex:0];
|
||||
[encoder setBuffer:buffer offset:0 atIndex:1];
|
||||
auto sizeC4 = UP_DIV(size, 4);
|
||||
|
@ -465,32 +519,32 @@ void MetalBackend::onCopyDeviceToHost(const Tensor *src, const Tensor *dst) cons
|
|||
[encoder dispatchThreadgroups:threads.first threadsPerThreadgroup:threads.second];
|
||||
|
||||
[encoder endEncoding];
|
||||
[ctx commit];
|
||||
[ctx wait];
|
||||
commit();
|
||||
wait();
|
||||
|
||||
memcpy(dst->host<float>(), buffer.contents, dst->size());
|
||||
} else {
|
||||
[ctx commit];
|
||||
[ctx wait];
|
||||
memcpy(dst->host<uint8_t>(), device.contents, dst->size());
|
||||
commit();
|
||||
wait();
|
||||
memcpy(dst->host<uint8_t>(), (uint8_t*)device.contents + TensorUtils::getDescribe(src)->extra.offset, dst->size());
|
||||
}
|
||||
}
|
||||
// convert
|
||||
else {
|
||||
auto size = getTensorShape(mShapeD2H, src);
|
||||
auto buffer = getHostBuffer(dst->size());
|
||||
auto encoder = [ctx encoder];
|
||||
auto encoder = [getCommandBufferForBufferCopy() computeCommandEncoder];
|
||||
auto kernel = kernelForConvert(src->getType(), sfmt, dfmt, Up);
|
||||
MNN_ASSERT(kernel != nil); // unsupported sfmt to dfmt
|
||||
|
||||
auto bandwidth = [ctx load:kernel encoder:encoder];
|
||||
auto bandwidth = [ctx load:kernel encoder:encoder fp16:mUseFloatAsFp16];
|
||||
[encoder setBuffer:device offset:TensorUtils::getDescribe(src)->extra.offset atIndex:0];
|
||||
[encoder setBuffer:buffer offset:0 atIndex:1];
|
||||
[encoder setBuffer:mShapeD2H offset:0 atIndex:2];
|
||||
[ctx dispatchEncoder:encoder threads:size bandwidth:bandwidth];
|
||||
[encoder endEncoding];
|
||||
[ctx commit];
|
||||
[ctx wait];
|
||||
commit();
|
||||
wait();
|
||||
memcpy(dst->host<float>(), buffer.contents, dst->size());
|
||||
}
|
||||
}
|
||||
|
@ -499,7 +553,7 @@ void MetalBackend::onCopyDeviceToDevice(const Tensor *src, const Tensor *dst,
|
|||
id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> shape) const {
|
||||
auto ctx = (__bridge MNNMetalContext *)context();
|
||||
auto standalone = encoder == nil;
|
||||
encoder = encoder ?: [ctx encoder];
|
||||
encoder = encoder ?: [getCommandBufferForBufferCopy() computeCommandEncoder];
|
||||
auto sfmt = TensorUtils::getDescribe(src)->dimensionFormat;
|
||||
auto dfmt = TensorUtils::getDescribe(dst)->dimensionFormat;
|
||||
|
||||
|
@ -507,7 +561,7 @@ void MetalBackend::onCopyDeviceToDevice(const Tensor *src, const Tensor *dst,
|
|||
if (sfmt == dfmt || src->dimensions() <= 1) {
|
||||
auto flt = dst->getType().code == halide_type_float;
|
||||
auto size = flt ? dst->elementSize() : dst->size();
|
||||
auto bandwidth = [ctx load:flt ? @"copy_float" : @"copy_byte" encoder:encoder];
|
||||
auto bandwidth = [ctx load:flt ? @"copy_float" : @"copy_byte" encoder:encoder fp16:mUseFloatAsFp16];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)src->deviceId())->getBuffer() offset:TensorUtils::getDescribe(src)->extra.offset atIndex:0];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)dst->deviceId())->getBuffer() offset:TensorUtils::getDescribe(dst)->extra.offset atIndex:1];
|
||||
[ctx dispatchEncoder:encoder threads:{(NSUInteger)size, 1, 1} bandwidth:bandwidth];
|
||||
|
@ -521,7 +575,7 @@ void MetalBackend::onCopyDeviceToDevice(const Tensor *src, const Tensor *dst,
|
|||
}
|
||||
|
||||
auto size = getTensorShape(shape, src);
|
||||
auto bandwidth = [ctx load:kernel encoder:encoder];
|
||||
auto bandwidth = [ctx load:kernel encoder:encoder fp16:mUseFloatAsFp16];
|
||||
[encoder setBuffer:( id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)(src->buffer().device))->getBuffer() offset:TensorUtils::getDescribe(src)->extra.offset atIndex:0];
|
||||
[encoder setBuffer:( id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)(dst->buffer().device))->getBuffer() offset:TensorUtils::getDescribe(dst)->extra.offset atIndex:1];
|
||||
[encoder setBuffer:shape offset:0 atIndex:2];
|
||||
|
@ -538,16 +592,15 @@ void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst) const {
|
|||
flushEncoder();
|
||||
auto ctx = (__bridge MNNMetalContext *)context();
|
||||
if(!mFrameEncodeCache) {
|
||||
[ctx commit_net];
|
||||
commit_net();
|
||||
}
|
||||
|
||||
onCopyBuffer(src, dst, nil, nil);
|
||||
}
|
||||
|
||||
id<MTLComputeCommandEncoder> MetalBackend::encoder() const {
|
||||
id<MTLComputeCommandEncoder> MetalBackend::encoder_for_net() const {
|
||||
if (nil == mComputeEncoder) {
|
||||
auto ctx = (__bridge MNNMetalContext *)context();
|
||||
mComputeEncoder = [ctx encoder_net];//TO DO :: use which cmdBuffer
|
||||
mComputeEncoder = encoder_net();//TO DO :: use which cmdBuffer
|
||||
}
|
||||
return mComputeEncoder;
|
||||
}
|
||||
|
@ -570,14 +623,99 @@ void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst, id<MTLComp
int MetalBackend::onSync(Tensor::MapType mtype, bool toCpu, const Tensor* dstTensor) {
    flushEncoder();
    auto ctx = (__bridge MNNMetalContext *)context();
    [ctx commit_net];
    commit_net();
    if (toCpu) {
        [ctx wait];
        wait();
    }
    mFrameEncodeCache = false;
    mOpEncoderSet = false;
    return 0;
}
id<MTLCommandBuffer> MetalBackend::getCommandBufferForBufferCopy() const {
    if (nil == _commandBuffer) {
        _commandBuffer = [_commandQueue commandBuffer];
        if (!mSupportDeferEncode) {
            // In this case _commandBuffer should be the same as _commandBuffer_net
            _commandBuffer_net = _commandBuffer;
        }
    }
    return _commandBuffer;
}
id<MTLCommandBuffer> MetalBackend::getCommandBufferForNet() const {
    if (nil == _commandBuffer_net) {
        _commandBuffer_net = [_commandQueue commandBuffer];
        if (!mSupportDeferEncode) {
            // In this case _commandBuffer should be the same as _commandBuffer_net
            _commandBuffer = _commandBuffer_net;
        }
    }
    return _commandBuffer_net;
}

void MetalBackend::commit() const {
    if (nil != _commandBuffer && _commandBuffer.status < MTLCommandBufferStatusCommitted) {
        [_commandBuffer commit];
        _waiting = _commandBuffer;
        _commandBuffer = nil;
        if (!mSupportDeferEncode) {
            // In this case _commandBuffer should be the same as _commandBuffer_net
            _commandBuffer_net = nil;
        }
    }
}

void MetalBackend::commit_net() const {
    if (nil != _commandBuffer_net && _commandBuffer_net.status < MTLCommandBufferStatusCommitted) {
        [_commandBuffer_net commit];
        _waiting = _commandBuffer_net;
        _commandBuffer_net = nil;
        if (!mSupportDeferEncode) {
            // In this case _commandBuffer should be the same as _commandBuffer_net
            _commandBuffer = nil;
        }
    }
}
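The two lazily created command buffers above must stay aliased whenever deferred encoding is unavailable, which is why `commit()` and `commit_net()` clear both handles together. Below is a minimal C++ sketch of that invariant; `Buffer`, `BufferRef` and `DualBufferState` are hypothetical stand-ins for `id<MTLCommandBuffer>` and the backend's members, not MNN types.

```cpp
#include <cassert>
#include <memory>

// Hypothetical stand-ins; id<MTLCommandBuffer> becomes a shared_ptr here.
struct Buffer {};
using BufferRef = std::shared_ptr<Buffer>;

struct DualBufferState {
    bool supportDeferEncode = false; // mirrors mSupportDeferEncode
    BufferRef copyBuffer;            // mirrors _commandBuffer
    BufferRef netBuffer;             // mirrors _commandBuffer_net

    BufferRef bufferForCopy() {
        if (!copyBuffer) {
            copyBuffer = std::make_shared<Buffer>();
            if (!supportDeferEncode) {
                netBuffer = copyBuffer; // both names refer to the same buffer
            }
        }
        return copyBuffer;
    }
    void commitCopy() {
        if (!copyBuffer) return;
        // [_commandBuffer commit] would happen here.
        copyBuffer.reset();
        if (!supportDeferEncode) {
            netBuffer.reset();          // keep the alias consistent
        }
    }
};

int main() {
    DualBufferState s;
    s.bufferForCopy();
    assert(s.copyBuffer == s.netBuffer);   // aliased while defer-encode is off
    s.commitCopy();
    assert(!s.copyBuffer && !s.netBuffer); // cleared together
    return 0;
}
```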
void MetalBackend::wait() const {
|
||||
if (nil != _waiting) {
|
||||
auto buffer = _waiting;
|
||||
if (buffer.status >= MTLCommandBufferStatusCompleted) {
|
||||
return;
|
||||
}
|
||||
|
||||
#if MNN_METAL_BENCHMARK
|
||||
NSTimeInterval begin = [NSDate timeIntervalSinceReferenceDate];
|
||||
[buffer waitUntilCompleted];
|
||||
NSTimeInterval end = [NSDate timeIntervalSinceReferenceDate];
|
||||
if (@available(iOS 10.3, *)) {
|
||||
printf("[METAL] commit costs: %.3fms\t(kernel: %.3fms, GPU: %.3fms)\n", (end - begin) * 1000.f,
|
||||
(buffer.kernelEndTime - buffer.kernelStartTime) * 1000.f,
|
||||
(buffer.GPUEndTime - buffer.GPUStartTime) * 1000.f);
|
||||
} else {
|
||||
printf("[METAL] commit costs: %.3fms\n", (end - begin) * 1000.f);
|
||||
}
|
||||
#else
|
||||
[buffer waitUntilCompleted];
|
||||
#endif
|
||||
|
||||
#if MNN_METAL_DEBUG
|
||||
if (buffer.error) {
|
||||
printf("[METAL] %s\n", buffer.error.localizedDescription.UTF8String);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
_waiting = nil;
|
||||
}
|
||||
|
||||
id<MTLComputePipelineState> MetalBackend::makeComputePipelineWithSourceOption(const char* csource, const char* cname, MTLCompileOptions *options) const{
|
||||
auto ctx = (__bridge MNNMetalContext *)context();
|
||||
auto source = [[NSString alloc] initWithUTF8String:csource];
|
||||
auto name = [[NSString alloc] initWithUTF8String:cname];
|
||||
return [ctx pipelineWithSourceOption:source name:name options:options];
|
||||
}
|
||||
void MetalRuntime::setCommandQueue(id<MTLCommandQueue> queue) {
|
||||
mQueue = queue;
|
||||
}
|
||||
|
||||
void MetalRuntime::setGpuMode(const int mode_num) {
|
||||
int totalSet = 0;
|
||||
|
@ -642,9 +780,6 @@ MetalRuntime* MetalRuntime::create(const Backend::Info& info, id<MTLDevice> devi
|
|||
if (nil == sharedContext.device) {
|
||||
sharedContext.device = device;
|
||||
}
|
||||
if (nil == sharedContext.queue) {
|
||||
sharedContext.queue = [sharedContext.device newCommandQueue];
|
||||
}
|
||||
auto mContext = (__bridge_retained void *)[[MNNMetalContext alloc] init];
|
||||
auto ctx = (__bridge MNNMetalContext *)mContext;
|
||||
BOOL res = [ctx initWithSharedContext:&sharedContext dev:device];
|
||||
|
@ -654,6 +789,18 @@ MetalRuntime* MetalRuntime::create(const Backend::Info& info, id<MTLDevice> devi
|
|||
}
|
||||
auto rt = new MetalRuntime(mContext);
|
||||
rt->setGpuMode(info.gpuMode);
|
||||
if (nil != sharedContext.queue) {
|
||||
rt->setCommandQueue(sharedContext.queue);
|
||||
}
|
||||
#ifdef MNN_METAL_TEST
|
||||
else {
|
||||
id<MTLCommandQueue> queue = [sharedContext.device newCommandQueue];
|
||||
rt->setCommandQueue(queue);
|
||||
}
|
||||
#endif
|
||||
if (nullptr != info.user) {
|
||||
rt->mDefaultConfig = *info.user;
|
||||
}
|
||||
return rt;
|
||||
}
|
||||
|
||||
|
@ -833,7 +980,12 @@ bool MetalRuntime::onMeasure(const std::vector<Tensor*>& inputs, const std::vect
}

Backend* MetalRuntime::onCreate(const BackendConfig* config) const {
    return new MetalBackend(mStatic, this);
    BackendConfig::PrecisionMode precision = mDefaultConfig.precision;
    if (nullptr != config) {
        precision = config->precision;
    }
    bool useFp16AsFp32 = precision != BackendConfig::Precision_High;
    return new MetalBackend(mStatic, this, useFp16AsFp32);
}
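With this change the precision mode chosen at session creation decides whether the Metal backend stores float data as fp16. A short usage sketch against MNN's public C++ API, assuming the usual Interpreter flow (the model path is a placeholder and cleanup is omitted):

```cpp
#include <MNN/Interpreter.hpp>
#include <MNN/MNNForwardType.h>

int main() {
    // Placeholder model path, for illustration only.
    auto net = MNN::Interpreter::createFromFile("model.mnn");
    MNN::ScheduleConfig sched;
    sched.type = MNN_FORWARD_METAL;
    MNN::BackendConfig bnConfig;
    // Precision_High keeps fp32 buffers; other modes let the Metal backend use fp16 storage.
    bnConfig.precision = MNN::BackendConfig::Precision_High;
    sched.backendConfig = &bnConfig;
    auto session = net->createSession(sched);
    (void)session; // run inference as usual; teardown omitted in this sketch
    return 0;
}
```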
void MetalRuntime::onGabageCollect(int level) {
|
||||
|
@ -895,6 +1047,9 @@ void registerMetalRuntimeCreator() {
|
|||
id<MTLDevice> device = MTLCreateSystemDefaultDevice();
|
||||
if (nil != device) {
|
||||
registerMetalOps();
|
||||
#ifdef MNN_SUPPORT_RENDER
|
||||
registerMetalRenderOps();
|
||||
#endif
|
||||
MNNInsertExtraRuntimeCreator(MNN_FORWARD_METAL, new MetalRuntimeCreator(device), false);
|
||||
} else {
|
||||
MNN_ERROR("Init Metal Error\n");
|
||||
|
|
|
@ -9,17 +9,16 @@
|
|||
#ifndef MetalBinary_hpp
|
||||
#define MetalBinary_hpp
|
||||
|
||||
#import "core/Execution.hpp"
|
||||
#import "MetalDefine.h"
|
||||
#import "MetalExecution.hpp"
|
||||
#include <string>
|
||||
#if MNN_METAL_ENABLED
|
||||
namespace MNN {
|
||||
|
||||
class MetalBinary : public Execution {
|
||||
class MetalBinary : public MetalExecution {
|
||||
public:
|
||||
MetalBinary(Backend *backend, std::string type, const MNN::Op *op);
|
||||
virtual ~MetalBinary() = default;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual void onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) override;
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
private:
|
||||
|
|
|
@ -14,13 +14,13 @@
|
|||
#if MNN_METAL_ENABLED
|
||||
namespace MNN {
|
||||
|
||||
MetalBinary::MetalBinary(Backend *backend, std::string type, const MNN::Op *op) : Execution(backend) {
|
||||
MetalBinary::MetalBinary(Backend *backend, std::string type, const MNN::Op *op) : MetalExecution(backend) {
|
||||
auto mKernelName = "binary_" + type + "_x1";
|
||||
auto mtbn = static_cast<MetalBackend *>(backend);
|
||||
auto context = (__bridge MNNMetalContext *)mtbn->context();
|
||||
mConstBuffer = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
|
||||
auto kn = [NSString stringWithCString:mKernelName.c_str() encoding:[NSString defaultCStringEncoding]];
|
||||
mPipeline = [context pipelineWithName:kn];
|
||||
mPipeline = [context pipelineWithName:kn fp16:mtbn->useFp16InsteadFp32()];
|
||||
mActivationType = op->main_as_BinaryOp()->activationType();
|
||||
}
|
||||
ErrorCode MetalBinary::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
|
@ -39,32 +39,14 @@ ErrorCode MetalBinary::onResize(const std::vector<Tensor *> &inputs, const std::
|
|||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode MetalBinary::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
|
||||
if(backend->isCommandEncoderSet()) {
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
auto func = [=](){
|
||||
void MetalBinary::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {
|
||||
auto input0 = inputs[0], input1 = inputs[1], output = outputs[0];
|
||||
auto encoder = backend->encoder();
|
||||
[encoder setComputePipelineState:mPipeline];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input0->deviceId())->getBuffer() offset:TensorUtils::getDescribe(input0)->extra.offset atIndex:0];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input1->deviceId())->getBuffer() offset:TensorUtils::getDescribe(input1)->extra.offset atIndex:1];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId())->getBuffer() offset:TensorUtils::getDescribe(output)->extra.offset atIndex:2];
|
||||
[encoder setBuffer:mConstBuffer offset:0 atIndex:3];
|
||||
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
|
||||
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
if(backend->isCmdBufferCommit()) {
|
||||
backend->flushEncoder();
|
||||
[context commit_net];
|
||||
}
|
||||
};
|
||||
func();
|
||||
backend->addOpEncoder(func);
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
#define CHECK(t, i) if (originOp == t) return i;
|
||||
|
|
|
@ -9,22 +9,23 @@
|
|||
#ifndef MetalCast_hpp
|
||||
#define MetalCast_hpp
|
||||
|
||||
#import "core/Execution.hpp"
|
||||
#import "MetalDefine.h"
|
||||
#import "MetalExecution.hpp"
|
||||
#import "Type_generated.h"
|
||||
|
||||
#if MNN_METAL_ENABLED
|
||||
namespace MNN {
|
||||
|
||||
class MetalCast : public Execution {
|
||||
class MetalCast : public MetalExecution {
|
||||
public:
|
||||
MetalCast(Backend *backend, DataType srcType, DataType dstType);
|
||||
MetalCast(Backend *backend, id<MTLComputePipelineState> pipeline);
|
||||
virtual ~MetalCast() = default;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual void onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) override;
|
||||
|
||||
private:
|
||||
DataType mSrcType;
|
||||
DataType mDstType;
|
||||
id<MTLBuffer> mConstBuffer;
|
||||
id<MTLComputePipelineState> mPipeline;
|
||||
std::pair<MTLSize, MTLSize> mThreads;
|
||||
};
|
||||
|
||||
} // namespace MNN
|
||||
|
|
|
@ -13,40 +13,55 @@
|
|||
|
||||
#if MNN_METAL_ENABLED
|
||||
namespace MNN {
|
||||
static const char* gCastTemplate =
|
||||
R"glsl(
|
||||
#include <metal_stdlib>
|
||||
using namespace metal;
|
||||
kernel void main0(const device T0 *in [[buffer(0)]],
|
||||
device T1 *out [[buffer(1)]],
|
||||
device uint4& s [[buffer(2)]],
|
||||
uint3 gid [[thread_position_in_grid]]) {
|
||||
if (gid.x < (uint)s.x) {
|
||||
int off = gid.x;
|
||||
T0 x = in[off];
|
||||
T1 y;
|
||||
y.x = x.x;
|
||||
y.y = x.y;
|
||||
y.z = x.z;
|
||||
y.w = x.w;
|
||||
TRANSOFRM;
|
||||
out[off] = y;
|
||||
}
|
||||
}
|
||||
)glsl";
|
||||
|
||||
MetalCast::MetalCast(Backend *backend, DataType srcType, DataType dstType)
|
||||
: Execution(backend), mSrcType(srcType), mDstType(dstType) {
|
||||
// nothing to do
|
||||
MetalCast::MetalCast(Backend *backend, id<MTLComputePipelineState> pipeline)
|
||||
: MetalExecution(backend) {
|
||||
auto mtbn = static_cast<MetalBackend *>(backend);
|
||||
auto context = (__bridge MNNMetalContext *)mtbn->context();
|
||||
mPipeline = pipeline;
|
||||
mConstBuffer = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
|
||||
}
|
||||
ErrorCode MetalCast::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto mtbn = static_cast<MetalBackend *>(backend());
|
||||
auto context = (__bridge MNNMetalContext *)mtbn->context();
|
||||
auto input = inputs[0];
|
||||
auto element = input->elementSize();
|
||||
auto sizeDiv4 = UP_DIV(element, 4);
|
||||
((int *)mConstBuffer.contents)[0] = sizeDiv4;
|
||||
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(sizeDiv4, 1, 1)];
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode MetalCast::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
void MetalCast::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
auto input = inputs[0], output = outputs[0];
|
||||
|
||||
NSString *kernel = nil;
|
||||
if (mSrcType == DataType_DT_FLOAT && mDstType == DataType_DT_INT32) {
|
||||
kernel = @"cast_float_to_int32";
|
||||
} else if (mSrcType == DataType_DT_INT32 && mDstType == DataType_DT_FLOAT) {
|
||||
kernel = @"cast_int32_to_float";
|
||||
} else if (mSrcType == DataType_DT_UINT8 && mDstType == DataType_DT_FLOAT) {
|
||||
kernel = @"cast_uint8_to_float";
|
||||
} else if (mSrcType == DataType_DT_UINT8 && mDstType == DataType_DT_INT32) {
|
||||
kernel = @"cast_uint8_to_int";
|
||||
} else if (mSrcType == DataType_DT_FLOAT && mDstType == DataType_DT_UINT8) {
|
||||
kernel = @"cast_float_to_uint8";
|
||||
} else {
|
||||
return NOT_SUPPORT;
|
||||
}
|
||||
|
||||
auto encoder = backend->encoder();
|
||||
auto bandwidth = [context load:kernel encoder:encoder];
|
||||
[encoder setComputePipelineState:mPipeline];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer() offset:TensorUtils::getDescribe(input)->extra.offset atIndex:0];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId())->getBuffer() offset:TensorUtils::getDescribe(output)->extra.offset atIndex:1];
|
||||
[context dispatchEncoder:encoder
|
||||
threads:{ (NSUInteger) output->elementSize(), (NSUInteger)1, (NSUInteger)1 }
|
||||
bandwidth:bandwidth];
|
||||
return NO_ERROR;
|
||||
[encoder setBuffer:mConstBuffer offset:0 atIndex:2];
|
||||
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
|
||||
}
|
||||
static DataType _mapDataType(DataType src) {
|
||||
if (DataType_DT_BOOL == src) {
|
||||
|
@ -63,27 +78,88 @@ static DataType _mapDataType(DataType src) {
|
|||
class MetalCastCreator : public MetalBackend::Creator {
|
||||
public:
|
||||
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend, const std::vector<Tensor *>& outputs) const {
|
||||
auto cast = op->main_as_CastParam();
|
||||
auto mtbn = static_cast<MetalBackend *>(backend);
|
||||
MTLCompileOptions *compileOptions = [[MTLCompileOptions alloc] init];
|
||||
NSString* T0 = nil;
|
||||
NSString* T1 = nil;
|
||||
NSString* TRANSOFRM = @"";
|
||||
auto dstT = op->main_as_CastParam()->dstT();
|
||||
if (dstT == DataType_DT_BOOL) {
|
||||
TRANSOFRM = @"y=select(int4(0),int4(1),y>0);";
|
||||
}
|
||||
auto dstType = _mapDataType(dstT);
|
||||
bool useFp16 = mtbn->useFp16InsteadFp32();
|
||||
switch (dstType) {
|
||||
case DataType_DT_FLOAT:
|
||||
if (useFp16) {
|
||||
T1 = @"half4";
|
||||
} else {
|
||||
T1 = @"float4";
|
||||
}
|
||||
break;
|
||||
case DataType_DT_INT8:
|
||||
T1 = @"char4";
|
||||
break;
|
||||
case DataType_DT_UINT8:
|
||||
T1 = @"uchar4";
|
||||
break;
|
||||
case DataType_DT_INT32:
|
||||
T1 = @"int4";
|
||||
break;
|
||||
default:
|
||||
MNN_ERROR("Don't support cast dst : %d\n", dstType);
|
||||
return nullptr;
|
||||
break;
|
||||
}
|
||||
auto srcType = inputs[0]->getType();
|
||||
auto dst = _mapDataType(cast->dstT());
|
||||
switch (srcType.code) {
|
||||
case halide_type_float:
|
||||
if (useFp16) {
|
||||
T0 = @"half4";
|
||||
} else {
|
||||
T0 = @"float4";
|
||||
}
|
||||
break;
|
||||
case halide_type_int:
|
||||
{
|
||||
if (srcType.bits == 32) {
|
||||
T0 = @"int4";
|
||||
} else if (srcType.bits == 8) {
|
||||
T0 = @"char4";
|
||||
} else {
|
||||
MNN_ERROR("Don't support cast src : %d\n", srcType.code);
|
||||
return nullptr;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case halide_type_uint:
|
||||
{
|
||||
if (srcType.bits == 32) {
|
||||
T0 = @"uint4";
|
||||
} else if (srcType.bits == 8) {
|
||||
T0 = @"uchar4";
|
||||
} else {
|
||||
MNN_ERROR("Don't support cast src : %d\n", srcType.code);
|
||||
return nullptr;
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
MNN_ERROR("Don't support cast src : %d\n", srcType.code);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (srcType.code == halide_type_float && dst == DataType_DT_INT32) {
|
||||
return new MetalCast(backend, DataType_DT_FLOAT, dst);
|
||||
compileOptions.preprocessorMacros = @{
|
||||
@"T0" : T0,
|
||||
@"T1" : T1,
|
||||
@"TRANSOFRM" : TRANSOFRM
|
||||
};
|
||||
auto pipeline = mtbn->makeComputePipelineWithSourceOption(gCastTemplate, "main0", compileOptions);
|
||||
if (nil == pipeline) {
|
||||
MNN_ERROR("Create Cast execution error for metal\n");
|
||||
return nullptr;
|
||||
}
|
||||
if (srcType.code == halide_type_int && srcType.bits == 32 && dst == DataType_DT_FLOAT) {
|
||||
return new MetalCast(backend, DataType_DT_INT32, dst);
|
||||
}
|
||||
if (srcType.code == halide_type_float && dst == DataType_DT_UINT8) {
|
||||
return new MetalCast(backend, DataType_DT_FLOAT, dst);
|
||||
}
|
||||
if (srcType.code == halide_type_uint && srcType.bits == 8 && dst == DataType_DT_FLOAT) {
|
||||
return new MetalCast(backend, DataType_DT_UINT8, dst);
|
||||
}
|
||||
if (srcType.code == halide_type_uint && srcType.bits == 8 && dst == DataType_DT_INT32) {
|
||||
return new MetalCast(backend, DataType_DT_UINT8, dst);
|
||||
}
|
||||
MNN_PRINT("%d, %d - %d\n", srcType.code, srcType.bits, dst);
|
||||
return NULL;
|
||||
return new MetalCast(backend, pipeline);
|
||||
}
|
||||
};
|
||||
REGISTER_METAL_OP_CREATOR(MetalCastCreator, OpType_Cast);
|
||||
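The cast kernel above is compiled from `gCastTemplate` with `T0`, `T1` and `TRANSOFRM` filled in through `MTLCompileOptions.preprocessorMacros`. The plain C++ sketch below performs an equivalent textual substitution on an abbreviated copy of the template, just to show the kind of source that reaches the Metal compiler for the fp16 float-to-int32 branch; the template string here is shortened and illustrative, not the real one.

```cpp
#include <iostream>
#include <string>

// Token -> text substitution, analogous to MTLCompileOptions.preprocessorMacros.
static std::string substitute(std::string s, const std::string& key, const std::string& value) {
    for (size_t pos = s.find(key); pos != std::string::npos; pos = s.find(key, pos + value.size())) {
        s.replace(pos, key.size(), value);
    }
    return s;
}

int main() {
    // Abbreviated stand-in for gCastTemplate.
    std::string tmpl = "kernel void main0(const device T0 *in, device T1 *out) { T1 y; /* ... */ TRANSOFRM; }";
    // fp16 storage, DataType_DT_INT32 destination: T0 = half4, T1 = int4, no extra transform.
    auto specialized = substitute(substitute(substitute(tmpl, "TRANSOFRM", ""), "T0", "half4"), "T1", "int4");
    std::cout << specialized << "\n";
    return 0;
}
```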
|
|
|
@ -3,14 +3,15 @@ import sys
|
|||
from os import listdir
|
||||
from os.path import isfile, join
|
||||
import makeshader
|
||||
shaderPath=sys.argv[1]
|
||||
cppPath= shaderPath + "/MetalOPRegister.mm"
|
||||
metalSourcePath=sys.argv[1]
|
||||
renderPath = os.path.join(metalSourcePath, "render")
|
||||
cppPath= os.path.join(metalSourcePath, "MetalOPRegister.mm")
|
||||
cppRenderPath = os.path.join(renderPath, 'MetalRenderOpRegister.mm')
|
||||
def genRegister():
|
||||
shaders=[]
|
||||
for root, dirs, files in os.walk(shaderPath):
|
||||
for file in files:
|
||||
for file in os.listdir(metalSourcePath):
|
||||
if file.endswith('.mm'):
|
||||
shaders.append(os.path.join(root,file))
|
||||
shaders.append(os.path.join(metalSourcePath,file))
|
||||
with open(cppPath,"w") as f:
|
||||
f.write("// This file is generated by Shell for ops register\n")
|
||||
f.write("#import \"backend/metal/MetalDefine.h\"\n")
|
||||
|
@ -31,19 +32,48 @@ def genRegister():
|
|||
for func in funcs:
|
||||
f.write(" "+func+"\n")
|
||||
f.write("}\n#endif\n}")
|
||||
if os.path.isdir(renderPath):
|
||||
shaders=[]
|
||||
for file in os.listdir(renderPath):
|
||||
if file.endswith('.mm'):
|
||||
shaders.append(os.path.join(renderPath,file))
|
||||
with open(cppRenderPath,"w") as f:
|
||||
f.write("// This file is generated by Shell for ops register\n")
|
||||
f.write("#import \"backend/metal/MetalDefine.h\"\n")
|
||||
f.write(" namespace MNN {\n")
|
||||
f.write("#if MNN_METAL_ENABLED\n")
|
||||
funcs=[]
|
||||
for shapath in shaders:
|
||||
with open(shapath,"r") as sha:
|
||||
lines=sha.readlines()
|
||||
for l in lines:
|
||||
if l.startswith("REGISTER_METAL_OP_CREATOR("):
|
||||
x=l.replace("REGISTER_METAL_OP_CREATOR(","").replace(")","").replace(" ","").replace(";","").replace("\n","").split(",")
|
||||
funcname="___"+x[0]+"__"+x[1]+"__();"
|
||||
funcs.append(funcname)
|
||||
f.write(" extern void "+funcname+"\n")
|
||||
pass
|
||||
f.write("void registerMetalRenderOps() {\n")
|
||||
for func in funcs:
|
||||
f.write(" "+func+"\n")
|
||||
f.write("}\n#endif\n}")
|
||||
|
||||
def genSchema():
|
||||
FLATC = shaderPath + "/../../../3rd_party/flatbuffers/tmp/flatc"
|
||||
sourceFile = shaderPath + "/schema/MetalCache.fbs"
|
||||
destFile = shaderPath + "/"
|
||||
FLATC = metalSourcePath + "/../../../3rd_party/flatbuffers/tmp/flatc"
|
||||
sourceFile = metalSourcePath + "/schema/MetalCache.fbs"
|
||||
destFile = metalSourcePath + "/"
|
||||
cmd = FLATC + " -c " + sourceFile +" --gen-object-api" +" --reflect-names"
|
||||
print(cmd)
|
||||
print(os.popen(cmd).read())
|
||||
return
|
||||
|
||||
def genShader():
|
||||
if os.path.isdir(renderPath):
|
||||
print("Has Render")
|
||||
shaders = makeshader.findAllShader("render/shader")
|
||||
makeshader.generateFile(os.path.join(renderPath, "AllRenderShader.hpp"), os.path.join(renderPath, "AllRenderShader.cpp"), shaders)
|
||||
shaders = makeshader.findAllShader("shader")
|
||||
makeshader.generateFile("AllShader.hpp", "AllShader.cpp", shaders)
|
||||
makeshader.generateFile(os.path.join(metalSourcePath, "AllShader.hpp"), os.path.join(metalSourcePath, "AllShader.cpp"), shaders)
|
||||
|
||||
if __name__ == '__main__':
|
||||
genRegister()
|
||||
|
|
|
@ -21,7 +21,7 @@ public:
|
|||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
protected:
|
||||
virtual ErrorCode onFloat(const Tensor *input, const Tensor *output) override;
|
||||
virtual void onFloat(const Tensor *input, const Tensor *output, id<MTLComputeCommandEncoder> encoder) override;
|
||||
|
||||
private:
|
||||
std::string mParam;
|
||||
|
|
|
@ -10,7 +10,6 @@
|
|||
#import "core/Macro.h"
|
||||
#import "backend/metal/MetalBackend.hpp"
|
||||
#import "backend/metal/MetalConvolution1x1.hpp"
|
||||
#import "backend/metal/MetalConvolutionGEMM.hpp"
|
||||
#import "backend/metal/MetalConvolutionWinograd.hpp"
|
||||
#include <string>
|
||||
|
||||
|
@ -26,6 +25,7 @@ ErrorCode MetalConvolution::onResize(const std::vector<Tensor *> &inputs, const
|
|||
|
||||
// prepare
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto mtbn = backend;
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
auto input = inputs[0];
|
||||
auto output = outputs[0];
|
||||
|
@ -92,13 +92,13 @@ ErrorCode MetalConvolution::onResize(const std::vector<Tensor *> &inputs, const
|
|||
NSUInteger gid_y = oh;
|
||||
NSUInteger gid_z = UP_DIV(oc_4, packC) * ob;
|
||||
|
||||
mPipeline = [context pipelineWithName:kernelName];
|
||||
mPipeline = [context pipelineWithName:kernelName fp16:backend->useFp16InsteadFp32()];
|
||||
NSArray *arr = [NSArray arrayWithObjects:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer(),
|
||||
(id<MTLBuffer>)(((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId()))->getBuffer(),
|
||||
mConstBuffer, mWeight, mBias, nil];
|
||||
|
||||
std::string name = [kernelName UTF8String] + mParam;
|
||||
auto ret = [context getGridAndThreadgroup:mPipeline gid:MTLSizeMake(gid_x, gid_y, gid_z) loop:10 buffer:arr runtime:rt shaderName:name];
|
||||
auto ret = [context getGridAndThreadgroup:mPipeline gid:MTLSizeMake(gid_x, gid_y, gid_z) loop:10 buffer:arr runtime:rt shaderName:name queue:backend->queue()];
|
||||
mThreads = std::make_pair(std::get<0>(ret), std::get<1>(ret));
|
||||
} else {
|
||||
const int total_kernel = 5;
|
||||
|
@ -130,13 +130,13 @@ ErrorCode MetalConvolution::onResize(const std::vector<Tensor *> &inputs, const
|
|||
mConstBuffer, mWeight, mBias, nil];
|
||||
|
||||
for(int knl_idx = 0; knl_idx < actual_kernel; knl_idx++) {
|
||||
id<MTLComputePipelineState> pipeline = [context pipelineWithName:shaderName[knl_idx]];
|
||||
id<MTLComputePipelineState> pipeline = [context pipelineWithName:shaderName[knl_idx] fp16:mtbn->useFp16InsteadFp32()];
|
||||
NSUInteger gid_x = UP_DIV(ow, itemW[knl_idx]);
|
||||
NSUInteger gid_y = UP_DIV(oh, itemH[knl_idx]);
|
||||
NSUInteger gid_z = UP_DIV(oc_4, itemC[knl_idx]) * ob;
|
||||
|
||||
std::string name = [shaderName[knl_idx] UTF8String] + mParam;
|
||||
auto ret = [context getGridAndThreadgroup:pipeline gid:MTLSizeMake(gid_x, gid_y, gid_z) loop:10 buffer:arr runtime:rt shaderName:name];
|
||||
auto ret = [context getGridAndThreadgroup:pipeline gid:MTLSizeMake(gid_x, gid_y, gid_z) loop:10 buffer:arr runtime:rt shaderName:name queue:backend->queue()];
|
||||
|
||||
if(min_cost.first > std::get<2>(ret)) {
|
||||
min_cost.first = std::get<2>(ret);
|
||||
|
@ -148,22 +148,13 @@ ErrorCode MetalConvolution::onResize(const std::vector<Tensor *> &inputs, const
|
|||
// printf("conv idx:%d, min_cost:%d\n", (int)min_cost.second, (int)min_cost.first);
|
||||
// std::string tmp = [shaderName[min_cost.second] UTF8String];
|
||||
// printf("!!~ %s\n", tmp.c_str());
|
||||
mPipeline = [context pipelineWithName:shaderName[min_cost.second]];
|
||||
mPipeline = [context pipelineWithName:shaderName[min_cost.second] fp16:mtbn->useFp16InsteadFp32()];
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode MetalConvolution::onFloat(const Tensor *input, const Tensor *output) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
|
||||
if(backend->isCommandEncoderSet()) {
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
auto func = [=](){
|
||||
void MetalConvolution::onFloat(const Tensor *input, const Tensor *output, id<MTLComputeCommandEncoder> encoder) {
|
||||
auto oc_4 = UP_DIV(output->channel(), 4);
|
||||
auto encoder = backend->encoder();
|
||||
|
||||
auto bandwidth = (MetalBandwidth){mPipeline.threadExecutionWidth, mPipeline.maxTotalThreadsPerThreadgroup, NO};
|
||||
|
||||
|
@ -175,18 +166,6 @@ ErrorCode MetalConvolution::onFloat(const Tensor *input, const Tensor *output) {
|
|||
[encoder setBuffer:mBias offset:0 atIndex:4];
|
||||
|
||||
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
|
||||
|
||||
//need to commit
|
||||
if(backend->isCmdBufferCommit()) {
|
||||
backend->flushEncoder();
|
||||
[context commit_net];
|
||||
}
|
||||
};
|
||||
|
||||
func();
|
||||
backend->addOpEncoder(func);
|
||||
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
class MetalConvolutionCreator : public MetalBackend::Creator {
|
||||
|
@ -207,9 +186,6 @@ public:
|
|||
if (MetalConvolutionWinograd::isValid(conv, inputs[0], outputs[0])) {
|
||||
return new MetalConvolutionWinograd(backend, input, op);
|
||||
}
|
||||
if (MetalConvolutionGEMM::isValid(conv, input)) {
|
||||
return new MetalConvolutionGEMM(backend, input, op);
|
||||
}
|
||||
if (MetalConvolution1x1::isValid(conv, input)) {
|
||||
return new MetalConvolution1x1(backend, op);
|
||||
}
|
||||
|
|
|
@ -22,7 +22,7 @@ public:
|
|||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
protected:
|
||||
virtual ErrorCode onFloat(const Tensor *input, const Tensor *output) override;
|
||||
virtual void onFloat(const Tensor *input, const Tensor *output, id<MTLComputeCommandEncoder> encoder) override;
|
||||
private:
|
||||
id<MTLComputePipelineState> mPipeline;
|
||||
std::pair<MTLSize, MTLSize> mThreads;
|
||||
|
|
|
@ -57,7 +57,7 @@ ErrorCode MetalConvolution1x1::onResize(const std::vector<Tensor *> &inputs, con
|
|||
NSUInteger gid_y = oc_4;
|
||||
NSUInteger gid_z = ob;
|
||||
|
||||
mPipeline = [context pipelineWithName:@"conv1x1_g1z8"];
|
||||
mPipeline = [context pipelineWithName:@"conv1x1_g1z8" fp16:backend->useFp16InsteadFp32()];
|
||||
|
||||
NSArray *arr = [NSArray arrayWithObjects:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer(),
|
||||
(id<MTLBuffer>)(((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId()))->getBuffer(),
|
||||
|
@ -65,14 +65,14 @@ ErrorCode MetalConvolution1x1::onResize(const std::vector<Tensor *> &inputs, con
|
|||
|
||||
std::string name = "conv1x1_g1z8";
|
||||
MetalRuntime *rt = (MetalRuntime *)backend->runtime();
|
||||
auto ret = [context getGridAndThreadgroup:mPipeline gid:MTLSizeMake(gid_x, gid_y, gid_z) loop:10 buffer:arr runtime:rt shaderName:name];
|
||||
auto ret = [context getGridAndThreadgroup:mPipeline gid:MTLSizeMake(gid_x, gid_y, gid_z) loop:10 buffer:arr runtime:rt shaderName:name queue:backend->queue()];
|
||||
mThreads = std::make_pair(std::get<0>(ret), std::get<1>(ret));
|
||||
} else {
|
||||
NSUInteger gid_x = UP_DIV(ow * oh, 4);
|
||||
NSUInteger gid_y = oc_4;
|
||||
NSUInteger gid_z = ob;
|
||||
|
||||
mPipeline = [context pipelineWithName:@"conv1x1_g1z4"];
|
||||
mPipeline = [context pipelineWithName:@"conv1x1_g1z4" fp16:backend->useFp16InsteadFp32()];
|
||||
|
||||
NSArray *arr = [NSArray arrayWithObjects:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer(),
|
||||
(id<MTLBuffer>)(((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId()))->getBuffer(),
|
||||
|
@ -80,7 +80,7 @@ ErrorCode MetalConvolution1x1::onResize(const std::vector<Tensor *> &inputs, con
|
|||
|
||||
std::string name = "conv1x1_g1z4";
|
||||
MetalRuntime *rt = (MetalRuntime *)backend->runtime();
|
||||
auto ret = [context getGridAndThreadgroup:mPipeline gid:MTLSizeMake(gid_x, gid_y, gid_z) loop:10 buffer:arr runtime:rt shaderName:name];
|
||||
auto ret = [context getGridAndThreadgroup:mPipeline gid:MTLSizeMake(gid_x, gid_y, gid_z) loop:10 buffer:arr runtime:rt shaderName:name queue:backend->queue()];
|
||||
mThreads = std::make_pair(std::get<0>(ret), std::get<1>(ret));
|
||||
//printf("conv1x1_z4, %d %d %d %d\n", ow, oh, oc_4, ic_4);
|
||||
}
|
||||
|
@ -100,13 +100,13 @@ ErrorCode MetalConvolution1x1::onResize(const std::vector<Tensor *> &inputs, con
|
|||
mConstBuffer, mWeight, mBias, nil];
|
||||
|
||||
for(int knl_idx = 0; knl_idx < actual_kernel; knl_idx++) {
|
||||
id<MTLComputePipelineState> pipeline = [context pipelineWithName:shaderName[knl_idx]];
|
||||
id<MTLComputePipelineState> pipeline = [context pipelineWithName:shaderName[knl_idx] fp16:backend->useFp16InsteadFp32()];
|
||||
NSUInteger gid_x = UP_DIV(ow, itemW[knl_idx]);
|
||||
NSUInteger gid_y = UP_DIV(oh, itemH[knl_idx]);
|
||||
NSUInteger gid_z = ob * UP_DIV(oc, itemC[knl_idx]);
|
||||
|
||||
std::string name = [shaderName[knl_idx] UTF8String];
|
||||
auto ret = [context getGridAndThreadgroup:pipeline gid:MTLSizeMake(gid_x, gid_y, gid_z) loop:10 buffer:arr runtime:rt shaderName:name];
|
||||
auto ret = [context getGridAndThreadgroup:pipeline gid:MTLSizeMake(gid_x, gid_y, gid_z) loop:10 buffer:arr runtime:rt shaderName:name queue:backend->queue()];
|
||||
|
||||
if(min_cost.first > std::get<2>(ret)) {
|
||||
min_cost.first = std::get<2>(ret);
|
||||
|
@ -116,21 +116,13 @@ ErrorCode MetalConvolution1x1::onResize(const std::vector<Tensor *> &inputs, con
|
|||
//printf("conv1x1 idx:%d, global:%d %d %d, local:%d %d %d, min_cost:%d\n", knl_idx, (int)retTune.second.first.width, (int)retTune.second.first.height, (int)retTune.second.first.depth, (int)retTune.second.second.width, (int)retTune.second.second.height, (int)retTune.second.second.depth, (int)retTune.first);
|
||||
}
|
||||
//printf("conv1x1 idx:%d, min_cost:%d\n", (int)min_cost.second, (int)min_cost.first);
|
||||
mPipeline = [context pipelineWithName:shaderName[min_cost.second]];
|
||||
mPipeline = [context pipelineWithName:shaderName[min_cost.second] fp16:backend->useFp16InsteadFp32()];
|
||||
}
|
||||
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode MetalConvolution1x1::onFloat(const Tensor *input, const Tensor *output) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
|
||||
if(backend->isCommandEncoderSet()) {
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
auto func = [=](){
|
||||
auto encoder = backend->encoder();
|
||||
void MetalConvolution1x1::onFloat(const Tensor *input, const Tensor *output, id<MTLComputeCommandEncoder> encoder) {
|
||||
[encoder setComputePipelineState:mPipeline];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer() offset:TensorUtils::getDescribe(input)->extra.offset atIndex:0];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId())->getBuffer() offset:TensorUtils::getDescribe(output)->extra.offset atIndex:1];
|
||||
|
@ -138,17 +130,6 @@ ErrorCode MetalConvolution1x1::onFloat(const Tensor *input, const Tensor *output
|
|||
[encoder setBuffer:mWeight offset:0 atIndex:3];
|
||||
[encoder setBuffer:mBias offset:0 atIndex:4];
|
||||
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
|
||||
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
if(backend->isCmdBufferCommit()) {
|
||||
backend->flushEncoder();
|
||||
[context commit_net];
|
||||
}
|
||||
};
|
||||
func();
|
||||
backend->addOpEncoder(func);
|
||||
|
||||
return NO_ERROR;
|
||||
}
|
||||
} // namespace MNN
|
||||
#endif /* MNN_METAL_ENABLED */
|
||||
|
|
|
@ -11,21 +11,22 @@
|
|||
|
||||
#import "core/ConvolutionCommon.hpp"
|
||||
#import "MetalBackend.hpp"
|
||||
#import "MetalExecution.hpp"
|
||||
#import "MNNMetalContext.h"
|
||||
#if MNN_METAL_ENABLED
|
||||
namespace MNN {
|
||||
|
||||
class MetalConvolutionCommon : public Execution {
|
||||
class MetalConvolutionCommon : public MetalExecution {
|
||||
public:
|
||||
MetalConvolutionCommon(Backend *backend, const MNN::Op *op);
|
||||
virtual ~MetalConvolutionCommon() = default;
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual void onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) override;
|
||||
|
||||
protected:
|
||||
void loadWeight(const MNN::Convolution2D *conv);
|
||||
|
||||
virtual ErrorCode onFloat(const Tensor *input, const Tensor *output) = 0;
|
||||
virtual void onFloat(const Tensor *input, const Tensor *output, id<MTLComputeCommandEncoder> encoder) = 0;
|
||||
virtual id<MTLBuffer> weightForFloat(int group, int oc, int ic, int kh, int kw, const float *src);
|
||||
|
||||
private:
|
||||
|
|
|
@ -16,23 +16,32 @@
#if MNN_METAL_ENABLED
namespace MNN {

static id<MTLBuffer> biasForConv(MNNMetalContext *context, const Convolution2D *conv) {
static id<MTLBuffer> biasForConv(MNNMetalContext *context, const Convolution2D *conv, bool fp16) {
    auto bias = conv->bias();
    auto oc = conv->common()->outputCount();
    auto bias_size = UP_DIV(oc, 16) * 16 * sizeof(metal_float);
    int bytes = 4;
    if (fp16) {
        bytes = 2;
    }
    auto bias_size = UP_DIV(oc, 16) * 16 *bytes;
    auto buffer = [context newDeviceBuffer:bias_size access:CPUWriteOnly];
    auto src = bias->data();
    auto dst = (metal_float *)buffer.contents;
    ::memset(dst, 0, bias_size);
    ::memset(buffer.contents, 0, bias_size);
    if (fp16) {
        auto dst = (__fp16 *)buffer.contents;
#pragma clang loop vectorize(enable) unroll(enable)
        for (int i = 0; i < oc; i++) {
            dst[i] = src[i];
        }
    } else {
        ::memcpy(buffer.contents, src, oc * sizeof(float));
    }
    return buffer;
}
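The bias buffer is padded to a multiple of 16 channels and zero-filled before the real values are copied in. As a quick check of that arithmetic, a standalone C++ sketch with `UP_DIV` re-declared locally and an arbitrarily chosen channel count:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Ceiling division, as the UP_DIV macro is used throughout MNN.
static inline int UP_DIV(int x, int y) { return (x + y - 1) / y; }

int main() {
    int oc    = 37; // arbitrary output-channel count for a hypothetical convolution
    int bytes = 2;  // 2 bytes per element when the backend stores fp16, 4 for fp32
    size_t biasSize = static_cast<size_t>(UP_DIV(oc, 16)) * 16 * bytes;
    std::vector<uint8_t> buffer(biasSize, 0); // zero-filled, like the memset above
    printf("oc=%d -> padded to %d channels -> %zu bytes (fp16)\n",
           oc, UP_DIV(oc, 16) * 16, biasSize);
    // 37 channels round up to 48, i.e. 96 bytes of fp16 (192 bytes if fp32).
    return 0;
}
```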
MetalConvolutionCommon::MetalConvolutionCommon(Backend *backend, const MNN::Op *op) : Execution(backend) {
|
||||
MetalConvolutionCommon::MetalConvolutionCommon(Backend *backend, const MNN::Op *op) : MetalExecution(backend) {
|
||||
auto context = (__bridge MNNMetalContext *)static_cast<MetalBackend *>(backend)->context();
|
||||
auto mtbn = static_cast<MetalBackend*>(backend);
|
||||
auto conv = op->main_as_Convolution2D();
|
||||
auto common = conv->common();
|
||||
mOp = op;
|
||||
|
@ -47,7 +56,7 @@ MetalConvolutionCommon::MetalConvolutionCommon(Backend *backend, const MNN::Op *
|
|||
mStrideY = common->strideY();
|
||||
mDilateX = common->dilateX();
|
||||
mDilateY = common->dilateY();
|
||||
mBias = biasForConv(context, conv);
|
||||
mBias = biasForConv(context, conv, mtbn->useFp16InsteadFp32());
|
||||
mActivationType = common->relu() ? 1 : (common->relu6() ? 2 : 0);
|
||||
}
|
||||
|
||||
|
@ -55,8 +64,8 @@ ErrorCode MetalConvolutionCommon::onResize(const std::vector<Tensor *> &inputs,
|
|||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode MetalConvolutionCommon::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
return onFloat(inputs[0], outputs[0]);
|
||||
void MetalConvolutionCommon::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {
|
||||
return onFloat(inputs[0], outputs[0], encoder);
|
||||
}
|
||||
|
||||
template <typename FType, typename TType>
|
||||
|
@ -103,8 +112,10 @@ void MetalConvolutionCommon::loadWeight(const MNN::Convolution2D *conv) {
|
|||
id<MTLBuffer> MetalConvolutionCommon::weightForFloat(int group, int oc, int ic, int kh, int kw, const float *src) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto context = (__bridge MNNMetalContext *)static_cast<MetalBackend *>(backend)->context();
|
||||
return weightInBlock<float, metal_float>(context, group, oc, ic, kh, kw, src);
|
||||
|
||||
if (backend->useFp16InsteadFp32()) {
|
||||
return weightInBlock<float, __fp16>(context, group, oc, ic, kh, kw, src);
|
||||
}
|
||||
return weightInBlock<float, float>(context, group, oc, ic, kh, kw, src);
|
||||
}
|
||||
|
||||
id<MTLBuffer> MetalConvolutionCommon::weightForConv(const Convolution2D *conv, ConvolutionCommon::Int8Common *qnt,
|
||||
|
|
|
@ -21,7 +21,7 @@ public:
|
|||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
protected:
|
||||
virtual ErrorCode onFloat(const Tensor *input, const Tensor *output) override;
|
||||
virtual void onFloat(const Tensor *input, const Tensor *output, id<MTLComputeCommandEncoder> encoder) override;
|
||||
virtual id<MTLBuffer> weightForFloat(int group, int oc, int ic, int kh, int kw, const float *src) override;
|
||||
private:
|
||||
id<MTLComputePipelineState> mPipeline;
|
||||
|
|
|
@ -62,7 +62,7 @@ ErrorCode MetalConvolutionDepthwise::onResize(const std::vector<Tensor *> &input
|
|||
::memcpy(mConstBuffer.contents, constants, sizeof(constants));
|
||||
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
mPipeline = [context pipelineWithName:@"conv_depthwise"];
|
||||
mPipeline = [context pipelineWithName:@"conv_depthwise" fp16:backend->useFp16InsteadFp32()];
|
||||
|
||||
NSUInteger gid_x = ow;
|
||||
NSUInteger gid_y = oh;
|
||||
|
@ -74,19 +74,12 @@ ErrorCode MetalConvolutionDepthwise::onResize(const std::vector<Tensor *> &input
|
|||
|
||||
std::string name = "conv_depthwise";
|
||||
MetalRuntime *rt = (MetalRuntime *)backend->runtime();
|
||||
auto ret = [context getGridAndThreadgroup:mPipeline gid:MTLSizeMake(gid_x, gid_y, gid_z) loop:10 buffer:arr runtime:rt shaderName:name];
|
||||
auto ret = [context getGridAndThreadgroup:mPipeline gid:MTLSizeMake(gid_x, gid_y, gid_z) loop:10 buffer:arr runtime:rt shaderName:name queue:backend->queue()];
|
||||
mThreads = std::make_pair(std::get<0>(ret), std::get<1>(ret));
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode MetalConvolutionDepthwise::onFloat(const Tensor *input, const Tensor *output) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
if(backend->isCommandEncoderSet()) {
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
auto func = [=](){
|
||||
auto encoder = backend->encoder();
|
||||
void MetalConvolutionDepthwise::onFloat(const Tensor *input, const Tensor *output, id<MTLComputeCommandEncoder> encoder) {
|
||||
[encoder setComputePipelineState:mPipeline];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer() offset: TensorUtils::getDescribe(input)->extra.offset
|
||||
atIndex:0];
|
||||
|
@ -96,17 +89,6 @@ atIndex:1];
|
|||
[encoder setBuffer:mWeight offset:0 atIndex:3];
|
||||
[encoder setBuffer:mBias offset:0 atIndex:4];
|
||||
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
|
||||
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
if(backend->isCmdBufferCommit()) {
|
||||
backend->flushEncoder();
|
||||
[context commit_net];
|
||||
}
|
||||
};
|
||||
func();
|
||||
backend->addOpEncoder(func);
|
||||
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
template <typename FType, typename TType>
|
||||
|
@ -132,7 +114,10 @@ static id<MTLBuffer> weightInBlock(MNNMetalContext *context, int group, int kh,
|
|||
id<MTLBuffer> MetalConvolutionDepthwise::weightForFloat(int group, int oc, int ic, int kh, int kw, const float *src) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto context = (__bridge MNNMetalContext *)static_cast<MetalBackend *>(backend)->context();
|
||||
return weightInBlock<float, metal_float>(context, group, kh, kw, src);
|
||||
if (backend->useFp16InsteadFp32()) {
|
||||
return weightInBlock<float, __fp16>(context, group, kh, kw, src);
|
||||
}
|
||||
return weightInBlock<float, float>(context, group, kh, kw, src);
|
||||
}
|
||||
|
||||
class MetalConvolutionDepthwiseCreator : public MetalBackend::Creator {
|
||||
|
|
|
@ -1,43 +0,0 @@
|
|||
//
|
||||
// MetalConvolutionGEMM.hpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2019/01/31.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifndef MetalConvolutionGEMM_hpp
|
||||
#define MetalConvolutionGEMM_hpp
|
||||
|
||||
#import "MetalConvolutionCommon.hpp"
|
||||
|
||||
#if MNN_METAL_ENABLED
|
||||
namespace MNN {
|
||||
|
||||
class MetalConvolutionGEMM : public MetalConvolutionCommon {
|
||||
public:
|
||||
static bool isValid(const Convolution2D *conv, const Tensor *input);
|
||||
MetalConvolutionGEMM(Backend *backend, const Tensor *input, const MNN::Op *op);
|
||||
virtual ~MetalConvolutionGEMM() = default;
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
protected:
|
||||
virtual ErrorCode onFloat(const Tensor *input, const Tensor *output) override;
|
||||
virtual id<MTLBuffer> weightForFloat(int group, int oc, int ic, int kh, int kw, const float *src) override;
|
||||
|
||||
private:
|
||||
id<MTLBuffer> mShapeBuffer = nil;
|
||||
std::shared_ptr<Tensor> mTempInput;
|
||||
std::shared_ptr<Tensor> mTempOutput;
|
||||
id<MTLComputePipelineState> mPipelineGEMM;
|
||||
std::pair<MTLSize, MTLSize> mGemm;
|
||||
id<MTLComputePipelineState> mPipelineIm2Col;
|
||||
std::pair<MTLSize, MTLSize> mIm2Col;
|
||||
id<MTLComputePipelineState> mPipelineCol2Im;
|
||||
std::pair<MTLSize, MTLSize> mCol2Im;
|
||||
};
|
||||
|
||||
} // namespace MNN
|
||||
#endif /* MNN_METAL_ENABLED */
|
||||
#endif /* MetalConvolutionGEMM_hpp */
|
|
@ -1,214 +0,0 @@
|
|||
//
|
||||
// MetalConvolutionGEMM.mm
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2019/01/31.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#import "backend/metal/MetalConvolutionGEMM.hpp"
|
||||
#import "core/Macro.h"
|
||||
#import "core/Macro.h"
|
||||
#import "backend/metal/MetalBackend.hpp"
|
||||
#import "backend/metal/MetalConvolution.hpp"
|
||||
|
||||
#if MNN_METAL_ENABLED
|
||||
namespace MNN {
|
||||
|
||||
bool MetalConvolutionGEMM::isValid(const Convolution2D *conv, const Tensor *input) {
|
||||
auto common = conv->common();
|
||||
auto kx = common->kernelX(), ky = common->kernelY();
|
||||
if (kx == 1 || ky == 1 || common->group() != 1) {
|
||||
return false;
|
||||
}
|
||||
auto oc = common->outputCount();
|
||||
if (oc <= 16) {
|
||||
return false;
|
||||
}
|
||||
auto iw = input->width(), ih = input->height(), ic = input->channel();
|
||||
if (iw * ih * ic <= 16384) {
|
||||
return false;
|
||||
}
|
||||
auto sx = common->strideX(), ow = (iw - kx + 1) / sx;
|
||||
auto sy = common->strideY(), oh = (ih - ky + 1) / sy;
|
||||
if ((iw * ih * ic) / (ow * oh * oc) > 4) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto unit = conv->quanParameter() != nullptr ? sizeof(float) : sizeof(metal_float);
|
||||
auto iz = UP_DIV(ic, 4), oz = UP_DIV(oc, 4), batch = input->batch();
|
||||
return UP_DIV(ow * oh * batch, 4) * kx * ky * iz * 16 * sizeof(metal_float) < (2 << 20) && // tmp input
|
||||
UP_DIV(ow * oh * batch, 4) * oz * 16 * unit < (2 << 20); // tmp output
|
||||
}
|
||||
|
||||
MetalConvolutionGEMM::MetalConvolutionGEMM(Backend *backend, const Tensor *input, const MNN::Op *op)
|
||||
: MetalConvolutionCommon(backend, op) {
|
||||
loadWeight(op->main_as_Convolution2D());
|
||||
}
|
||||
|
||||
ErrorCode MetalConvolutionGEMM::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
// prepare
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
auto input = inputs[0], output = outputs[0];
|
||||
auto iw = input->width();
|
||||
auto ih = input->height();
|
||||
auto ic_4 = UP_DIV(input->channel(), 4);
|
||||
auto ow = output->width();
|
||||
auto oh = output->height();
|
||||
    auto oc_4 = UP_DIV(output->channel(), 4);
    auto ob = output->batch();

    auto pads = ConvolutionCommon::convolutionPad(input, output, mOp->main_as_Convolution2D()->common());
    auto padX = pads.first;
    auto padY = pads.second;

    // create const buffer
    int constants[] = {iw,
                       ih,
                       iw * ih,
                       ic_4,
                       ow,
                       oh,
                       ow * oh,
                       oc_4,
                       ob,

                       mKernelX,
                       mKernelY,
                       mKernelX * mKernelY,
                       mStrideX,
                       mStrideY,
                       padX,
                       padY,
                       mDilateX,
                       mDilateY,
                       mActivationType};
    mConstBuffer = backend->getConstBuffer(sizeof(constants));
    ::memcpy(mConstBuffer.contents, constants, sizeof(constants));

    // create mat mul const buffer
    int shapes[] = {UP_DIV(ow * oh * ob, 4), oc_4, mKernelX * mKernelY * ic_4, 1};
    mShapeBuffer = [context newDeviceBuffer:sizeof(shapes) bytes:shapes access:CPUWriteOnly];

    // acquire space for source & dst
    int is = UP_DIV(ow * oh * ob, 4) * mKernelX * mKernelY * ic_4 * 16 * sizeof(metal_float) / sizeof(uint8_t);
    int os = UP_DIV(ow * oh * ob, 4) * oc_4 * 16 * sizeof(metal_float) / sizeof(uint8_t);
    mTempInput.reset(Tensor::createDevice<uint8_t>(std::vector<int>{is}));
    mTempOutput.reset(Tensor::createDevice<uint8_t>(std::vector<int>{os}));

    if (!backend->onAcquireBuffer(mTempInput.get(), Backend::DYNAMIC) ||
        !backend->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC)) {
        return OUT_OF_MEMORY;
    }
    backend->onReleaseBuffer(mTempInput.get(), Backend::DYNAMIC);
    backend->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
    mPipelineGEMM   = [context pipelineWithName:@"matmul4x4"];
    mPipelineIm2Col = [context pipelineWithName:@"conv_im2col"];
    mPipelineCol2Im = [context pipelineWithName:@"conv_col2im"];
    NSUInteger gw = UP_DIV(output->width() * output->height() * output->batch(), 4);
    NSUInteger gh = UP_DIV(output->channel(), 4);

    {
        NSUInteger gid_x = gw;
        NSUInteger gid_y = gh;
        NSUInteger gid_z = 1;
        NSArray *arr = [NSArray arrayWithObjects:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)mTempInput->deviceId())->getBuffer(),
                        (id<MTLBuffer>)(((MetalRuntimeAllocator::MetalBufferAlloc *)mTempOutput->deviceId()))->getBuffer(), mWeight, mShapeBuffer, nil];

        std::string name = "matmul4x4";
        MetalRuntime *rt = (MetalRuntime *)backend->runtime();
        auto ret = [context getGridAndThreadgroup:mPipelineGEMM gid:MTLSizeMake(gid_x, gid_y, gid_z) loop:10 buffer:arr runtime:rt shaderName:name];
        mGemm = std::make_pair(std::get<0>(ret), std::get<1>(ret));
    }
    mIm2Col = [context computeBestGroupAndLocal:mPipelineIm2Col threads:{(NSUInteger)ow, (NSUInteger)oh, (NSUInteger)ic_4*ob}];
    mCol2Im = [context computeBestGroupAndLocal:mPipelineCol2Im threads:{(NSUInteger)ow, (NSUInteger)oh, (NSUInteger)oc_4*ob}];
    return NO_ERROR;
}

ErrorCode MetalConvolutionGEMM::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    return onFloat(inputs[0], outputs[0]);
}

ErrorCode MetalConvolutionGEMM::onFloat(const Tensor *input, const Tensor *output) {
    auto backend = static_cast<MetalBackend *>(this->backend());

    if(backend->isCommandEncoderSet()) {
        return NO_ERROR;
    }

    auto func = [=](){
        auto encoder = backend->encoder();
        { // im2col
            [encoder setComputePipelineState:mPipelineIm2Col];
            [encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer() offset:TensorUtils::getDescribe(input)->extra.offset atIndex:0];
            [encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)mTempInput->deviceId())->getBuffer() offset:TensorUtils::getDescribe(mTempInput.get())->extra.offset atIndex:1];
            [encoder setBuffer:mConstBuffer offset:0 atIndex:2];
            [encoder dispatchThreadgroups:mIm2Col.first threadsPerThreadgroup:mIm2Col.second];
        }
        { // gemm
            [encoder setComputePipelineState:mPipelineGEMM];
            [encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)mTempInput->deviceId())->getBuffer() offset:TensorUtils::getDescribe(mTempInput.get())->extra.offset atIndex:0];
            [encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)mTempOutput->deviceId())->getBuffer() offset:TensorUtils::getDescribe(mTempOutput.get())->extra.offset atIndex:1];
            [encoder setBuffer:mWeight offset:0 atIndex:2];
            [encoder setBuffer:mShapeBuffer offset:0 atIndex:3];
            [encoder dispatchThreadgroups:mGemm.first threadsPerThreadgroup:mGemm.second];
        }
        { // col2im
            [encoder setComputePipelineState:mPipelineCol2Im];
            [encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)mTempOutput->deviceId())->getBuffer() offset:TensorUtils::getDescribe(mTempOutput.get())->extra.offset atIndex:0];
            [encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId())->getBuffer() offset:TensorUtils::getDescribe(output)->extra.offset atIndex:1];
            [encoder setBuffer:mBias offset:0 atIndex:2];
            [encoder setBuffer:mConstBuffer offset:0 atIndex:3];
            [encoder dispatchThreadgroups:mCol2Im.first threadsPerThreadgroup:mCol2Im.second];
        }

        auto context = (__bridge MNNMetalContext *)backend->context();
        if(backend->isCmdBufferCommit()) {
            backend->flushEncoder();
            [context commit_net];
        }
    };
    func();
    backend->addOpEncoder(func);

    return NO_ERROR;
}

template <typename FType, typename TType>
static id<MTLBuffer> weightInBlock(MNNMetalContext *context, int group, int oc, int ic, int kh, int kw,
                                   const FType *src) {
    auto oz     = UP_DIV(oc, 4);
    auto iz     = UP_DIV(ic, 4);
    auto buffer = [context newDeviceBuffer:oz * iz * kw * kh * 16 * sizeof(TType) access:CPUWriteOnly];
    auto dst    = (TType *)buffer.contents;

    for (int o = 0; o < oc; o++) {
        auto zo = o / 4, ro = o % 4;
        auto o_dst = dst + zo * iz * kh * kw * 16 + ro; // o/4 x 4
#pragma clang loop vectorize(enable)
        for (int i = 0; i < ic; i++) {
            auto zi = i / 4, ri = i % 4;
            auto i_dst = o_dst + zi * kh * kw * 16 + ri * 4; // i/4 x 4
#pragma clang loop vectorize(enable)
            for (int h = 0; h < kh; h++) {
#pragma clang loop vectorize(enable) unroll(enable)
                for (int w = 0; w < kw; w++) {
                    // to   [g][o/4][i/4][h][w][16]
                    // from [g][o][i][h][w]
                    i_dst[(h * kw + w) * 16] = *src++;
                }
            }
        }
    }
    return buffer;
}

id<MTLBuffer> MetalConvolutionGEMM::weightForFloat(int group, int oc, int ic, int kh, int kw, const float *src) {
    auto backend = static_cast<MetalBackend *>(this->backend());
    auto context = (__bridge MNNMetalContext *)static_cast<MetalBackend *>(backend)->context();
    return weightInBlock<float, metal_float>(context, group, oc, ic, kh, kw, src);
}

} // namespace MNN
#endif /* MNN_METAL_ENABLED */
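// Editorial sketch (not part of the diff): how the pieces above fit together, assuming the
// matmul4x4 kernel multiplies 4x4-packed tiles. gemmConvShape is illustrative only and simply
// restates the shapes[] array built in onResize.
static inline void gemmConvShape(int ow, int oh, int ob, int oc_4, int ic_4,
                                 int kernelX, int kernelY, int shape[4]) {
    shape[0] = UP_DIV(ow * oh * ob, 4);   // M: output pixels, packed by 4 (rows written by conv_im2col)
    shape[1] = oc_4;                      // N: output channels / 4 (rows of mWeight, laid out [o/4][i/4][kh][kw][16])
    shape[2] = kernelX * kernelY * ic_4;  // K: unfolded kh*kw*ic patch, channels / 4
    shape[3] = 1;                         // single matmul batch
}
// onFloat then encodes three passes: conv_im2col fills mTempInput (M x K tiles), matmul4x4 writes
// mTempOutput (M x N tiles), and conv_col2im scatters the result back to the NC4HW4 output,
// presumably adding mBias and applying mActivationType carried in mConstBuffer.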
@ -22,7 +22,7 @@ public:
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

protected:
    virtual ErrorCode onFloat(const Tensor *input, const Tensor *output) override;
    virtual void onFloat(const Tensor *input, const Tensor *output, id<MTLComputeCommandEncoder> encoder) override;
    virtual id<MTLBuffer> weightForFloat(int group, int oc, int ic, int kh, int kw, const float *src) override;

private:
@ -110,10 +110,11 @@ ErrorCode MetalConvolutionWinograd::onResize(const std::vector<Tensor *> &inputs
|
|||
mOutputTransformThreads.width = uw;
|
||||
mOutputTransformThreads.height = uh;
|
||||
mOutputTransformThreads.depth = oz;
|
||||
int bytes = backend->useFp16InsteadFp32() ? 2 : 4;
|
||||
|
||||
// accquire space
|
||||
int is = mSrcUnit * mSrcUnit * us * iz * 16 * sizeof(metal_float) / sizeof(uint8_t);
|
||||
int os = mSrcUnit * mSrcUnit * us * oz * 16 * sizeof(metal_float) / sizeof(uint8_t);
|
||||
int is = mSrcUnit * mSrcUnit * us * iz * 16 * bytes;
|
||||
int os = mSrcUnit * mSrcUnit * us * oz * 16 * bytes;
|
||||
mTempSrc.reset(Tensor::createDevice<uint8_t>(std::vector<int>{is}));
|
||||
mTempDst.reset(Tensor::createDevice<uint8_t>(std::vector<int>{os}));
|
||||
backend->onAcquireBuffer(mTempSrc.get(), Backend::DYNAMIC);
|
||||
|
@ -124,25 +125,19 @@ ErrorCode MetalConvolutionWinograd::onResize(const std::vector<Tensor *> &inputs
|
|||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode MetalConvolutionWinograd::onFloat(const Tensor *input, const Tensor *output) {
|
||||
void MetalConvolutionWinograd::onFloat(const Tensor *input, const Tensor *output, id<MTLComputeCommandEncoder> encoder) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
|
||||
if(backend->isCommandEncoderSet()) {
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
auto func = [=](){
|
||||
auto encoder = backend->encoder();
|
||||
{ // transform
|
||||
auto bandwidth = [context load:mKernelX == 3 ? @"winograd_transform_source2_3_1" : @"winograd_transform_source2_5_1" encoder:encoder];
|
||||
auto bandwidth = [context load:mKernelX == 3 ? @"winograd_transform_source2_3_1" : @"winograd_transform_source2_5_1" encoder:encoder fp16:backend->useFp16InsteadFp32()];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer() offset:TensorUtils::getDescribe(input)->extra.offset atIndex:0];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)mTempSrc->deviceId())->getBuffer() offset:TensorUtils::getDescribe(mTempSrc.get())->extra.offset atIndex:1];
|
||||
[encoder setBuffer:mConstBuffer offset:0 atIndex:2];
|
||||
[context dispatchEncoder:encoder threads:mInputTransformThreads bandwidth:bandwidth];
|
||||
}
|
||||
{ // gemm
|
||||
auto bandwidth = [context load:@"matmul4x4" encoder:encoder];
|
||||
auto bandwidth = [context load:@"matmul4x4" encoder:encoder fp16:backend->useFp16InsteadFp32()];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)mTempSrc->deviceId())->getBuffer() offset:TensorUtils::getDescribe(mTempSrc.get())->extra.offset atIndex:0];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)mTempDst->deviceId())->getBuffer() offset:TensorUtils::getDescribe(mTempDst.get())->extra.offset atIndex:1];
|
||||
[encoder setBuffer:mWeight offset:0 atIndex:2];
|
||||
|
@ -150,25 +145,13 @@ ErrorCode MetalConvolutionWinograd::onFloat(const Tensor *input, const Tensor *o
|
|||
[context dispatchEncoder:encoder threads:mMatMulThreads bandwidth:bandwidth];
|
||||
}
|
||||
{ // transform
|
||||
auto bandwidth = [context load:mKernelX == 3 ? @"winograd_transform_dest2_3_1" : @"winograd_transform_dest2_5_1" encoder:encoder];
|
||||
auto bandwidth = [context load:mKernelX == 3 ? @"winograd_transform_dest2_3_1" : @"winograd_transform_dest2_5_1" encoder:encoder fp16:backend->useFp16InsteadFp32()];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)mTempDst->deviceId())->getBuffer() offset:TensorUtils::getDescribe(mTempDst.get())->extra.offset atIndex:0];
|
||||
[encoder setBuffer:mBias offset:0 atIndex:1];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId())->getBuffer() offset:TensorUtils::getDescribe(output)->extra.offset atIndex:2];
|
||||
[encoder setBuffer:mConstBuffer offset:0 atIndex:3];
|
||||
[context dispatchEncoder:encoder threads:mOutputTransformThreads bandwidth:bandwidth];
|
||||
}
|
||||
MNN_PRINT_ENCODER(context, encoder);
|
||||
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
if(backend->isCmdBufferCommit()) {
|
||||
backend->flushEncoder();
|
||||
[context commit_net];
|
||||
}
|
||||
};
|
||||
func();
|
||||
backend->addOpEncoder(func);
|
||||
|
||||
return NO_ERROR;
|
||||
}
|
||||
id<MTLBuffer> MetalConvolutionWinograd::weightForFloat(int group, int oc, int ic, int kh, int kw, const float *src) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
|
@ -179,18 +162,23 @@ id<MTLBuffer> MetalConvolutionWinograd::weightForFloat(int group, int oc, int ic
|
|||
std::shared_ptr<Tensor> dstWeight = generater.allocTransformWeight(srcWeight.get(), 4, 4);
|
||||
generater.transformWeight(dstWeight.get(), srcWeight.get());
|
||||
|
||||
#if MNN_METAL_FULL_PRECISION
|
||||
auto bytes = dstWeight->host<metal_float>();
|
||||
#else
|
||||
std::shared_ptr<Tensor> dstWeightHalf(Tensor::create<int16_t>(dstWeight->shape()));
|
||||
int bytenumber = 4;
|
||||
void* bytes = nullptr;
|
||||
std::shared_ptr<Tensor> dstWeightHalf;
|
||||
if (backend->useFp16InsteadFp32()) {
|
||||
dstWeightHalf.reset(Tensor::create<int16_t>(dstWeight->shape()));
|
||||
auto f32 = dstWeight->host<float>();
|
||||
auto f16 = dstWeightHalf->host<metal_float>();
|
||||
auto f16 = dstWeightHalf->host<__fp16>();
|
||||
for (int i = 0; i < dstWeight->elementSize(); ++i) {
|
||||
f16[i] = f32[i];
|
||||
}
|
||||
auto bytes = dstWeightHalf->host<metal_float>();
|
||||
#endif
|
||||
return [context newDeviceBuffer:4 * UP_DIV(ic, 4) * UP_DIV(oc, 4) * mSrcUnit * mSrcUnit * 4 * sizeof(metal_float)
|
||||
bytes = dstWeightHalf->host<void>();
|
||||
bytenumber = 2;
|
||||
} else {
|
||||
bytes = dstWeight->host<float>();
|
||||
bytenumber = 4;
|
||||
}
|
||||
return [context newDeviceBuffer:4 * UP_DIV(ic, 4) * UP_DIV(oc, 4) * mSrcUnit * mSrcUnit * 4 * bytenumber
|
||||
bytes:bytes
|
||||
access:CPUWriteOnly];
|
||||
}
|
||||
|
|
|
@ -9,19 +9,17 @@
|
|||
#ifndef MetalDeconvolution_hpp
|
||||
#define MetalDeconvolution_hpp
|
||||
|
||||
#import "core/Execution.hpp"
|
||||
#import "MNN_generated.h"
|
||||
#import "MetalDefine.h"
|
||||
|
||||
#import "MetalExecution.hpp"
|
||||
#include "MNN_generated.h"
|
||||
#if MNN_METAL_ENABLED
|
||||
namespace MNN {
|
||||
|
||||
class MetalDeconvolution : public Execution {
|
||||
class MetalDeconvolution : public MetalExecution {
|
||||
public:
|
||||
MetalDeconvolution(Backend *backend, const MNN::Op *op);
|
||||
virtual ~MetalDeconvolution() = default;
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual void onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) override;
|
||||
|
||||
private:
|
||||
bool mDepthwise = false;
|
||||
|
|
|
@ -38,13 +38,9 @@ static id<MTLBuffer> weightForDeconv(MNNMetalContext *context, int group, int oc
|
|||
auto dst = (TType *)buffer.contents;
|
||||
|
||||
for (int g = 0; g < group; g++) {
|
||||
#pragma clang loop vectorize(enable)
|
||||
for (int i = 0; i < gic; i++) {
|
||||
#pragma clang loop vectorize(enable)
|
||||
for (int o = 0; o < goc; o++) {
|
||||
#pragma clang loop vectorize(enable)
|
||||
for (int h = 0; h < kh; h++) {
|
||||
#pragma clang loop vectorize(enable) unroll(enable)
|
||||
for (int w = 0; w < kw; w++) {
|
||||
auto zo = o / 4, ro = o % 4;
|
||||
auto zi = i / 4, ri = i % 4;
|
||||
|
@ -85,7 +81,8 @@ static id<MTLBuffer> weightForDepthwise(MNNMetalContext *context, int group, int
|
|||
return buffer;
|
||||
}
|
||||
|
||||
static id<MTLBuffer> weightForDeconv(MNNMetalContext *context, bool depthwise, const Convolution2D *deconv,
|
||||
template <typename TType>
|
||||
id<MTLBuffer> weightForDeconv(MNNMetalContext *context, bool depthwise, const Convolution2D *deconv,
|
||||
ConvolutionCommon::Int8Common *qnt) {
|
||||
auto size = qnt ? qnt->weightFloat.size() : deconv->weight()->size();
|
||||
auto common = deconv->common();
|
||||
|
@ -95,31 +92,43 @@ static id<MTLBuffer> weightForDeconv(MNNMetalContext *context, bool depthwise, c
|
|||
auto oc = common->outputCount();
|
||||
auto ic = size / kw / kh / (oc / group);
|
||||
if (depthwise) {
|
||||
return weightForDepthwise<float, metal_float>(context, group, kh, kw,
|
||||
return weightForDepthwise<float, TType>(context, group, kh, kw,
|
||||
qnt ? qnt->weightFloat.get() : deconv->weight()->data());
|
||||
} else {
|
||||
return weightForDeconv<float, metal_float>(context, group, oc, ic, kh, kw,
|
||||
return weightForDeconv<float, TType>(context, group, oc, ic, kh, kw,
|
||||
qnt ? qnt->weightFloat.get() : deconv->weight()->data());
|
||||
}
|
||||
}
|
||||
|
||||
static id<MTLBuffer> biasForDeconv(MNNMetalContext *context, const Convolution2D *deconv) {
|
||||
static id<MTLBuffer> biasForDeconv(MNNMetalContext *context, const Convolution2D *deconv, bool fp16) {
|
||||
auto bias = deconv->bias();
|
||||
if (!bias || bias->size() == 0)
|
||||
return [context newDeviceBuffer:0 access:CPUTransparent];
|
||||
|
||||
auto oc = deconv->common()->outputCount();
|
||||
auto buffer = [context newDeviceBuffer:UP_DIV(oc, 4) * 4 * sizeof(metal_float) access:CPUWriteOnly];
|
||||
int bytes = 4;
|
||||
if (fp16) {
|
||||
bytes = 2;
|
||||
}
|
||||
auto buffer = [context newDeviceBuffer:UP_DIV(oc, 4) * 4 * bytes access:CPUWriteOnly];
|
||||
auto src = bias->data();
|
||||
auto dst = (metal_float *)buffer.contents;
|
||||
#pragma clang loop vectorize(enable) unroll(enable)
|
||||
for (int i = 0; i < oc; i++)
|
||||
if (fp16) {
|
||||
auto dst = (__fp16 *)buffer.contents;
|
||||
for (int i = 0; i < oc; i++) {
|
||||
dst[i] = src[i];
|
||||
}
|
||||
} else {
|
||||
auto dst = (float *)buffer.contents;
|
||||
for (int i = 0; i < oc; i++) {
|
||||
dst[i] = src[i];
|
||||
}
|
||||
}
|
||||
return buffer;
|
||||
}
|
||||
|
||||
MetalDeconvolution::MetalDeconvolution(Backend *backend, const MNN::Op *op) : Execution(backend) {
|
||||
MetalDeconvolution::MetalDeconvolution(Backend *backend, const MNN::Op *op) : MetalExecution(backend) {
|
||||
auto context = (__bridge MNNMetalContext *)static_cast<MetalBackend *>(backend)->context();
|
||||
auto mtbn = static_cast<MetalBackend *>(backend);
|
||||
auto deconv = op->main_as_Convolution2D();
|
||||
auto common = deconv->common();
|
||||
mOp = op;
|
||||
|
@ -141,12 +150,16 @@ MetalDeconvolution::MetalDeconvolution(Backend *backend, const MNN::Op *op) : Ex
|
|||
if (deconv->quanParameter()) {
|
||||
qnt = ConvolutionCommon::load(deconv, backend, true);
|
||||
}
|
||||
mWeight = weightForDeconv(context, mDepthwise, deconv, qnt.get());
|
||||
mBias = biasForDeconv(context, deconv);
|
||||
if (mDepthwise) {
|
||||
mPipeline = [context pipelineWithName:@"deconv_depthwise"];
|
||||
if (mtbn->useFp16InsteadFp32()) {
|
||||
mWeight = weightForDeconv<__fp16>(context, mDepthwise, deconv, qnt.get());
|
||||
} else {
|
||||
mPipeline = [context pipelineWithName:@"deconv"];
|
||||
mWeight = weightForDeconv<float>(context, mDepthwise, deconv, qnt.get());
|
||||
}
|
||||
mBias = biasForDeconv(context, deconv, mtbn->useFp16InsteadFp32());
|
||||
if (mDepthwise) {
|
||||
mPipeline = [context pipelineWithName:@"deconv_depthwise" fp16:mtbn->useFp16InsteadFp32()];
|
||||
} else {
|
||||
mPipeline = [context pipelineWithName:@"deconv" fp16:mtbn->useFp16InsteadFp32()];
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -198,17 +211,8 @@ ErrorCode MetalDeconvolution::onResize(const std::vector<Tensor *> &inputs, cons
|
|||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode MetalDeconvolution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
|
||||
if(backend->isCommandEncoderSet()) {
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
auto func = [=](){
|
||||
void MetalDeconvolution::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {
|
||||
auto input = inputs[0], output = outputs[0];
|
||||
|
||||
auto encoder = backend->encoder();
|
||||
[encoder setComputePipelineState:mPipeline];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer() offset:TensorUtils::getDescribe(input)->extra.offset atIndex:0];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId())->getBuffer() offset:TensorUtils::getDescribe(output)->extra.offset atIndex:1];
|
||||
|
@ -216,17 +220,6 @@ ErrorCode MetalDeconvolution::onExecute(const std::vector<Tensor *> &inputs, con
|
|||
[encoder setBuffer:mWeight offset:0 atIndex:3];
|
||||
[encoder setBuffer:mBias offset:0 atIndex:4];
|
||||
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
|
||||
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
if(backend->isCmdBufferCommit()) {
|
||||
backend->flushEncoder();
|
||||
[context commit_net];
|
||||
}
|
||||
};
|
||||
func();
|
||||
backend->addOpEncoder(func);
|
||||
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
class MetalDeconvolutionCreator : public MetalBackend::Creator {
|
||||
|
|
|
@ -19,11 +19,6 @@
#import <float.h>
#endif

#if (TARGET_OS_IPHONE && TARGET_OS_SIMULATOR)
#undef MNN_METAL_ENABLED
#define MNN_METAL_ENABLED 0
#endif

#endif
#ifndef MNN_METAL_DEBUG
#if DEBUG

@ -34,14 +29,6 @@
#endif

#define MNN_METAL_BENCHMARK 0
#define MNN_METAL_FULL_PRECISION 0 // should edit in metal too

#if MNN_METAL_FULL_PRECISION || !defined(__FLT16_EPSILON__)
typedef float metal_float;
#define MNNMetalPixelFormatRGBAFloat MTLPixelFormatRGBA32Float
#else
typedef __fp16 metal_float;
#define MNNMetalPixelFormatRGBAFloat MTLPixelFormatRGBA16Float
#endif

#endif /* MetalDefine_h */
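// Editorial sketch (not part of the diff): with the compile-time metal_float typedef above removed,
// precision becomes a per-backend runtime choice. MetalBackend::useFp16InsteadFp32() is the real
// query used by the touched files; elementBytes/convertToHalf below are illustrative helpers only.
static inline int elementBytes(bool useFp16) {
    return useFp16 ? 2 : 4;               // __fp16 vs float in device buffers
}
static inline void convertToHalf(const float *src, __fp16 *dst, int count) {
    for (int i = 0; i < count; ++i) {
        dst[i] = src[i];                  // same narrowing loop as weightForFloat / biasForDeconv use
    }
}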
@ -8,23 +8,21 @@

#ifndef MetalEltwise_hpp
#define MetalEltwise_hpp

#import "core/Execution.hpp"
#import "MetalExecution.hpp"
#import "MNN_generated.h"
#import "MetalDefine.h"

#if MNN_METAL_ENABLED
namespace MNN {

class MetalEltwise : public Execution {
class MetalEltwise : public MetalExecution {
public:
    MetalEltwise(Backend *backend, EltwiseType type);
    virtual ~MetalEltwise() = default;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual void onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) override;
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

private:
    void encode(const Tensor *input0, const Tensor *input1, const Tensor *output);
    void encode(const Tensor *input0, const Tensor *input1, const Tensor *output, id<MTLComputeCommandEncoder> encoder);
    id<MTLComputePipelineState> mPipeline;
    id<MTLBuffer> mConst;
    std::pair<MTLSize, MTLSize> mThreads;
@ -14,7 +14,7 @@
|
|||
#if MNN_METAL_ENABLED
|
||||
namespace MNN {
|
||||
|
||||
MetalEltwise::MetalEltwise(Backend *backend, EltwiseType type) : Execution(backend) {
|
||||
MetalEltwise::MetalEltwise(Backend *backend, EltwiseType type) : MetalExecution(backend) {
|
||||
auto metal = static_cast<MetalBackend *>(backend);
|
||||
auto context = (__bridge MNNMetalContext *)metal->context();
|
||||
mConst = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
|
||||
|
@ -32,7 +32,7 @@ MetalEltwise::MetalEltwise(Backend *backend, EltwiseType type) : Execution(backe
|
|||
default:
|
||||
break;
|
||||
}
|
||||
mPipeline = [context pipelineWithName:kernel];
|
||||
mPipeline = [context pipelineWithName:kernel fp16:metal->useFp16InsteadFp32()];
|
||||
}
|
||||
ErrorCode MetalEltwise::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
((int*)(mConst.contents))[0] = outputs[0]->elementSize();
|
||||
|
@ -43,9 +43,7 @@ ErrorCode MetalEltwise::onResize(const std::vector<Tensor *> &inputs, const std:
|
|||
return NO_ERROR;
|
||||
}
|
||||
|
||||
void MetalEltwise::encode(const Tensor *input0, const Tensor *input1, const Tensor *output) {
|
||||
auto metal = static_cast<MetalBackend *>(this->backend());
|
||||
auto encoder = metal->encoder();
|
||||
void MetalEltwise::encode(const Tensor *input0, const Tensor *input1, const Tensor *output, id<MTLComputeCommandEncoder> encoder) {
|
||||
[encoder setComputePipelineState:mPipeline];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input0->deviceId())->getBuffer() offset:TensorUtils::getDescribe(input0)->extra.offset atIndex:0];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input1->deviceId())->getBuffer() offset:TensorUtils::getDescribe(input1)->extra.offset atIndex:1];
|
||||
|
@ -54,30 +52,12 @@ void MetalEltwise::encode(const Tensor *input0, const Tensor *input1, const Tens
|
|||
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
|
||||
}
|
||||
|
||||
ErrorCode MetalEltwise::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
|
||||
if(backend->isCommandEncoderSet()) {
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
auto func = [=](){
|
||||
void MetalEltwise::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {
|
||||
auto output = outputs[0];
|
||||
encode(inputs[0], inputs[1], output);
|
||||
encode(inputs[0], inputs[1], output, encoder);
|
||||
for (int i = 2; i < inputs.size(); i++) {
|
||||
encode(inputs[i], output, output);
|
||||
encode(inputs[i], output, output, encoder);
|
||||
}
|
||||
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
if(backend->isCmdBufferCommit()) {
|
||||
backend->flushEncoder();
|
||||
[context commit_net];
|
||||
}
|
||||
};
|
||||
func();
|
||||
backend->addOpEncoder(func);
|
||||
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
class MetalEltwiseCreator : public MetalBackend::Creator {
|
||||
|
|
|
@ -0,0 +1,21 @@
#ifndef MetalExecution_hpp
#define MetalExecution_hpp

#include "core/Execution.hpp"
#import "MetalDefine.h"
#include <string>
#if MNN_METAL_ENABLED
namespace MNN {

class MetalExecution : public Execution {
public:
    MetalExecution(Backend *backend);
    virtual ~MetalExecution() = default;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual void onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) = 0;

};
} // namespace MNN
#endif /* MNN_METAL_ENABLED */

#endif

@ -0,0 +1,32 @@
#include "MetalExecution.hpp"
#import "backend/metal/MetalBackend.hpp"

#if MNN_METAL_ENABLED
namespace MNN {
MetalExecution::MetalExecution(Backend *backend) : Execution(backend) {
    // Do nothing
}

ErrorCode MetalExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto backend = static_cast<MetalBackend *>(this->backend());

    if(backend->isCommandEncoderSet()) {
        return NO_ERROR;
    }

    auto func = [=](){
        auto encoder = backend->encoder_for_net();
        this->onEncode(inputs, outputs, encoder);
        if(backend->isCmdBufferCommit()) {
            backend->flushEncoder();
            backend->commit_net();
        }
    };
    func();
    backend->addOpEncoder(func);
    return NO_ERROR;
}


};
#endif
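// Editorial sketch (not part of the diff): MetalExecution::onExecute above now owns the
// encoder / flushEncoder / commit_net boilerplate, so an op only overrides onEncode.
// MetalFoo is a hypothetical subclass shown for illustration; the real ops below follow this shape.
class MetalFoo : public MetalExecution {
public:
    MetalFoo(Backend *backend) : MetalExecution(backend) {
        // build pipelines and constant buffers here, as the ops' constructors do
    }
    virtual void onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
                          id<MTLComputeCommandEncoder> encoder) override {
        // bind buffers and dispatch only; no isCommandEncoderSet()/commit handling needed
    }
};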
@ -9,18 +9,17 @@
|
|||
#ifndef MetalFuse_hpp
|
||||
#define MetalFuse_hpp
|
||||
|
||||
#import "core/Execution.hpp"
|
||||
#import "MetalExecution.hpp"
|
||||
#import "MNN_generated.h"
|
||||
#import "MetalDefine.h"
|
||||
|
||||
#if MNN_METAL_ENABLED
|
||||
namespace MNN {
|
||||
|
||||
class MetalFuse : public Execution {
|
||||
class MetalFuse : public MetalExecution {
|
||||
public:
|
||||
MetalFuse(Backend *backend, const Op* op);
|
||||
virtual ~MetalFuse() = default;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual void onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) override;
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
private:
|
||||
|
|
|
@ -16,7 +16,7 @@
|
|||
#if MNN_METAL_ENABLED
|
||||
namespace MNN {
|
||||
// #define MNN_FUSE_DEBUG
|
||||
MetalFuse::MetalFuse(Backend *backend, const Op* op) : Execution(backend), mOp(op) {
|
||||
MetalFuse::MetalFuse(Backend *backend, const Op* op) : MetalExecution(backend), mOp(op) {
|
||||
auto mtbn = static_cast<MetalBackend *>(backend);
|
||||
auto context = (__bridge MNNMetalContext *)mtbn->context();
|
||||
mConstBuffer = [context newDeviceBuffer:3 * sizeof(int) access:CPUWriteOnly];
|
||||
|
@ -27,9 +27,7 @@ MetalFuse::MetalFuse(Backend *backend, const Op* op) : Execution(backend), mOp(o
|
|||
#ifdef MNN_FUSE_DEBUG
|
||||
MNN_PRINT("MetalFuse srcCode:\n%s\n", srcCode);
|
||||
#endif
|
||||
auto source = [[NSString alloc] initWithUTF8String:ss.str().c_str()];
|
||||
auto name = [[NSString alloc] initWithUTF8String:extra->type()->c_str()];
|
||||
mPipeline = [context pipelineWithSource:source name:name];
|
||||
mPipeline = mtbn->makeComputePipelineWithSourceOption(ss.str().c_str(), extra->type()->c_str(), nil);
|
||||
}
|
||||
|
||||
ErrorCode MetalFuse::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
|
@ -43,16 +41,8 @@ ErrorCode MetalFuse::onResize(const std::vector<Tensor *> &inputs, const std::ve
|
|||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode MetalFuse::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
|
||||
if(backend->isCommandEncoderSet()) {
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
auto func = [=](){
|
||||
void MetalFuse::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {
|
||||
auto input = inputs[0], output = outputs[0];
|
||||
auto encoder = backend->encoder();
|
||||
[encoder setComputePipelineState:mPipeline];
|
||||
int i = 0;
|
||||
for (; i < inputs.size(); i++) {
|
||||
|
@ -84,20 +74,153 @@ ErrorCode MetalFuse::onExecute(const std::vector<Tensor *> &inputs, const std::v
|
|||
MNN_PRINT("=============================\n");
|
||||
}
|
||||
#endif
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
if(backend->isCmdBufferCommit()) {
|
||||
backend->flushEncoder();
|
||||
[context commit_net];
|
||||
}
|
||||
};
|
||||
func();
|
||||
backend->addOpEncoder(func);
|
||||
|
||||
static bool _isStandardFuse(const Op* op) {
|
||||
if (op->type() != OpType_Extra) {
|
||||
return false;
|
||||
}
|
||||
if (nullptr == op->main_as_Extra()) {
|
||||
return false;
|
||||
}
|
||||
auto extra = op->main_as_Extra();
|
||||
if (nullptr == extra->attr()) {
|
||||
return false;
|
||||
}
|
||||
for (int i=0; i<extra->attr()->size(); ++i) {
|
||||
auto attr = extra->attr()->GetAs<Attribute>(i);
|
||||
if (attr->key()->str() == "version") {
|
||||
if (nullptr != attr->s()) {
|
||||
std::string cont = attr->s()->str();
|
||||
return cont == "common";
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
class MetalFuseV2 : public MetalExecution {
|
||||
public:
|
||||
MetalFuseV2(Backend *backend, const Op* op, int outputSize, int inputSize) : MetalExecution(backend) {
|
||||
mOutputBinding.resize(outputSize);
|
||||
mInputBinding.resize(inputSize);
|
||||
auto mtbn = static_cast<MetalBackend*>(backend);
|
||||
auto context = (__bridge MNNMetalContext *)mtbn->context();
|
||||
auto extra = op->main_as_Extra();
|
||||
// Find shader
|
||||
const char* source = nil;
|
||||
for (int i=0; i<extra->attr()->size(); ++i) {
|
||||
auto attr = extra->attr()->GetAs<Attribute>(i);
|
||||
if (attr->key()->str() == "metal") {
|
||||
source = attr->s()->c_str();
|
||||
break;
|
||||
}
|
||||
}
|
||||
mPipeline = mtbn->makeComputePipelineWithSourceOption(source, "main0", nil);
|
||||
|
||||
// Init size
|
||||
for (int i=0; i<extra->attr()->size(); ++i) {
|
||||
auto attr = extra->attr()->GetAs<Attribute>(i);
|
||||
if (attr->key()->str() == "group_size") {
|
||||
auto ptr = attr->tensor()->int32s()->data();
|
||||
mGroupSize.width = ptr[0];
|
||||
mGroupSize.height = ptr[1];
|
||||
mGroupSize.depth = ptr[2];
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (int i=0; i<extra->attr()->size(); ++i) {
|
||||
auto attr = extra->attr()->GetAs<Attribute>(i);
|
||||
if (attr->key()->str() == "local_size") {
|
||||
auto ptr = attr->tensor()->int32s()->data();
|
||||
mThreadSize.width = ptr[0];
|
||||
mThreadSize.height = ptr[1];
|
||||
mThreadSize.depth = ptr[2];
|
||||
break;
|
||||
}
|
||||
}
|
||||
int maxIndex = -1;
|
||||
for (int i=0; i<extra->attr()->size(); ++i) {
|
||||
auto attr = extra->attr()->GetAs<Attribute>(i);
|
||||
if (attr->key()->str() == "input") {
|
||||
maxIndex = ALIMAX(maxIndex, attr->i());
|
||||
} else if (attr->key()->str() == "const") {
|
||||
maxIndex = ALIMAX(maxIndex, attr->i());
|
||||
}
|
||||
}
|
||||
for (int i=0; i<extra->attr()->size(); ++i) {
|
||||
auto attr = extra->attr()->GetAs<Attribute>(i);
|
||||
if (attr->key()->str() == "input") {
|
||||
auto list = attr->list()->i()->data();
|
||||
if (list[1] >= 0) {
|
||||
if (0 == list[0]) {
|
||||
mInputBinding[list[1]] = attr->i();
|
||||
} else {
|
||||
mOutputBinding[list[1]] = attr->i();
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (attr->key()->str() == "const") {
|
||||
auto b = attr->tensor();
|
||||
void* result = nullptr;
|
||||
size_t bufferSize = 0;
|
||||
switch (b->dataType()) {
|
||||
case DataType_DT_FLOAT:
|
||||
result = (void*)b->float32s()->Data();
|
||||
bufferSize = b->float32s()->size() * sizeof(float);
|
||||
break;
|
||||
case DataType_DT_INT32:
|
||||
result = (void*)b->int32s()->Data();
|
||||
bufferSize = b->int32s()->size() * sizeof(float);
|
||||
break;
|
||||
default:
|
||||
MNN_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
// TODO: Fuse All Const Buffer to One buffer
|
||||
id<MTLBuffer> constBuffer = [context newDeviceBuffer:bufferSize access:CPUWriteOnly];
|
||||
::memcpy([constBuffer contents], result, bufferSize);
|
||||
|
||||
mConstIndides.emplace_back(std::make_pair(attr->i(), std::make_pair(constBuffer, 0)));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
virtual ~MetalFuseV2() = default;
|
||||
virtual void onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) override {
|
||||
[encoder setComputePipelineState:mPipeline];
|
||||
for (int i=0; i<inputs.size(); ++i) {
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)inputs[i]->deviceId())->getBuffer() offset:TensorUtils::getDescribe(inputs[i])->extra.offset atIndex:mInputBinding[i]];
|
||||
}
|
||||
for (int i=0; i<outputs.size(); ++i) {
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)outputs[i]->deviceId())->getBuffer() offset:TensorUtils::getDescribe(outputs[i])->extra.offset atIndex:mOutputBinding[i]];
|
||||
}
|
||||
for (int i=0; i<mConstIndides.size(); ++i) {
|
||||
[encoder setBuffer:mConstIndides[i].second.first offset:0 atIndex:mConstIndides[i].first];
|
||||
}
|
||||
[encoder dispatchThreadgroups:mGroupSize threadsPerThreadgroup:mThreadSize];
|
||||
}
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
|
||||
return NO_ERROR;
|
||||
}
|
||||
private:
|
||||
MTLSize mGroupSize;
|
||||
MTLSize mThreadSize;
|
||||
std::vector<int> mInputBinding;
|
||||
std::vector<int> mOutputBinding;
|
||||
std::vector<std::pair<int, std::pair<id<MTLBuffer>, size_t>>> mConstIndides;
|
||||
id<MTLComputePipelineState> mPipeline;
|
||||
};
|
||||
|
||||
class MetalFuseCreator : public MetalBackend::Creator {
|
||||
public:
|
||||
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend, const std::vector<Tensor *>& outputs) const {
|
||||
if (_isStandardFuse(op)) {
|
||||
return new MetalFuseV2(backend, op, (int)outputs.size(), (int)inputs.size());
|
||||
}
|
||||
return new MetalFuse(backend, op);
|
||||
}
|
||||
};
|
||||
|
|
|
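// Editorial summary (derived from the MetalFuseV2 parsing code above, not from separate docs):
// the Extra op's attributes are expected to carry
//   "version"    : string, "common" selects MetalFuseV2 (see _isStandardFuse)
//   "metal"      : string, Metal shader source compiled with entry point "main0"
//   "group_size" : int32[3] tensor -> dispatchThreadgroups grid
//   "local_size" : int32[3] tensor -> threadsPerThreadgroup
//   "input"      : i() is the buffer binding slot; list()->i() = {0 input / 1 output, tensor index}
//   "const"      : i() is the binding slot; tensor() holds float32/int32 data copied to an MTLBuffer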
@ -9,18 +9,18 @@
|
|||
#ifndef MetalGridSample_hpp
|
||||
#define MetalGridSample_hpp
|
||||
|
||||
#import "core/Execution.hpp"
|
||||
#import "MetalExecution.hpp"
|
||||
#import "MNN_generated.h"
|
||||
#import "MetalBackend.hpp"
|
||||
|
||||
#if MNN_METAL_ENABLED
|
||||
namespace MNN {
|
||||
|
||||
class MetalGridSample : public Execution {
|
||||
class MetalGridSample : public MetalExecution {
|
||||
public:
|
||||
MetalGridSample(Backend *backend, const GridSample* gridSample);
|
||||
virtual ~MetalGridSample() = default;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual void onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) override;
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
private:
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
namespace MNN {
|
||||
|
||||
MetalGridSample::MetalGridSample(Backend *backend, const GridSample *gridSample)
|
||||
: Execution(backend) {
|
||||
: MetalExecution(backend) {
|
||||
mMode = gridSample->mode();
|
||||
mPaddingMode = gridSample->paddingMode();
|
||||
mAlignCorners = gridSample->alignCorners();
|
||||
|
@ -40,7 +40,7 @@ ErrorCode MetalGridSample::onResize(const std::vector<Tensor *> &inputs,
|
|||
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
mPipeline = [context pipelineWithName:@"grid_sample"];
|
||||
mPipeline = [context pipelineWithName:@"grid_sample" fp16:backend->useFp16InsteadFp32()];
|
||||
|
||||
int batches = ((int *)mParams.contents)[0];
|
||||
int channels = ((int *)mParams.contents)[1];
|
||||
|
@ -52,32 +52,13 @@ ErrorCode MetalGridSample::onResize(const std::vector<Tensor *> &inputs,
|
|||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode MetalGridSample::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
|
||||
if(backend->isCommandEncoderSet()) {
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
auto func = [=](){
|
||||
auto encoder = backend->encoder();
|
||||
void MetalGridSample::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {
|
||||
[encoder setComputePipelineState:mPipeline];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)inputs[0]->deviceId())->getBuffer() offset:TensorUtils::getDescribe(inputs[0])->extra.offset atIndex:0];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)inputs[1]->deviceId())->getBuffer() offset:TensorUtils::getDescribe(inputs[1])->extra.offset atIndex:1];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)outputs[0]->deviceId())->getBuffer() offset:TensorUtils::getDescribe(outputs[0])->extra.offset atIndex:2];
|
||||
[encoder setBuffer:mParams offset:0 atIndex:3];
|
||||
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
|
||||
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
if(backend->isCmdBufferCommit()) {
|
||||
backend->flushEncoder();
|
||||
[context commit_net];
|
||||
}
|
||||
};
|
||||
func();
|
||||
backend->addOpEncoder(func);
|
||||
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
class MetalGridSampleCreator : public MetalBackend::Creator {
|
||||
|
|
|
@ -9,17 +9,16 @@
|
|||
#ifndef MetalInterp_hpp
|
||||
#define MetalInterp_hpp
|
||||
|
||||
#include "core/Execution.hpp"
|
||||
#include "MetalDefine.h"
|
||||
#include "MetalExecution.hpp"
|
||||
|
||||
#if MNN_METAL_ENABLED
|
||||
namespace MNN {
|
||||
|
||||
class MetalInterp : public Execution {
|
||||
class MetalInterp : public MetalExecution {
|
||||
public:
|
||||
MetalInterp(Backend *backend, const Op* op);
|
||||
virtual ~MetalInterp() = default;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual void onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) override;
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
private:
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
namespace MNN {
|
||||
|
||||
MetalInterp::MetalInterp(Backend *backend, const Op* op)
|
||||
: Execution(backend) {
|
||||
: MetalExecution(backend) {
|
||||
auto interpParam = op->main_as_Interp();
|
||||
auto mBk = static_cast<MetalBackend *>(this->backend());
|
||||
auto context = (__bridge MNNMetalContext *)mBk->context();
|
||||
|
@ -43,12 +43,12 @@ ErrorCode MetalInterp::onResize(const std::vector<Tensor *> &inputs, const std::
|
|||
((int *)mShape.contents)[6] = slice;
|
||||
if (mReiszeType == 2 || mReiszeType == 1) {
|
||||
if (2 == mReiszeType) {
|
||||
mPipeline = [context pipelineWithName:@"resize_bilinear"];
|
||||
mPipeline = [context pipelineWithName:@"resize_bilinear" fp16:backend->useFp16InsteadFp32()];
|
||||
} else {
|
||||
mPipeline = [context pipelineWithName:@"resize_nearest"];
|
||||
mPipeline = [context pipelineWithName:@"resize_nearest" fp16:backend->useFp16InsteadFp32()];
|
||||
}
|
||||
} else if (mReiszeType == 3) {
|
||||
mPipeline = [context pipelineWithName:@"resize_cubic"];
|
||||
mPipeline = [context pipelineWithName:@"resize_cubic" fp16:backend->useFp16InsteadFp32()];
|
||||
} else {
|
||||
MNN_ASSERT(false);
|
||||
}
|
||||
|
@ -57,36 +57,15 @@ ErrorCode MetalInterp::onResize(const std::vector<Tensor *> &inputs, const std::
|
|||
return NO_ERROR;
|
||||
}
|
||||
|
||||
|
||||
ErrorCode MetalInterp::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
if(backend->isCommandEncoderSet()) {
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
auto func = [=](){
|
||||
|
||||
void MetalInterp::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {
|
||||
auto input = inputs[0], output = outputs[0];
|
||||
// encode
|
||||
auto encoder = backend->encoder();
|
||||
[encoder setComputePipelineState:mPipeline];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer() offset:TensorUtils::getDescribe(input)->extra.offset atIndex:0];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId())->getBuffer() offset:TensorUtils::getDescribe(output)->extra.offset atIndex:1];
|
||||
[encoder setBuffer:mShape offset:0 atIndex:2];
|
||||
[encoder setBuffer:mCordTransform offset:0 atIndex:3];
|
||||
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
|
||||
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
if(backend->isCmdBufferCommit()) {
|
||||
backend->flushEncoder();
|
||||
[context commit_net];
|
||||
}
|
||||
};
|
||||
|
||||
func();
|
||||
backend->addOpEncoder(func);
|
||||
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
class MetalInterpCreator : public MetalBackend::Creator {
|
||||
|
|
|
@ -9,19 +9,18 @@
|
|||
#ifndef MetalLayerNorm_hpp
|
||||
#define MetalLayerNorm_hpp
|
||||
|
||||
#import "core/Execution.hpp"
|
||||
#import "MetalExecution.hpp"
|
||||
#import "MNN_generated.h"
|
||||
#import "MetalDefine.h"
|
||||
|
||||
#if MNN_METAL_ENABLED
|
||||
namespace MNN {
|
||||
|
||||
class MetalLayerNorm : public Execution {
|
||||
class MetalLayerNorm : public MetalExecution {
|
||||
public:
|
||||
MetalLayerNorm(Backend *backend, const LayerNorm *layernorm);
|
||||
virtual ~MetalLayerNorm() = default;
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual void onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) override;
|
||||
|
||||
private:
|
||||
int mOutside;
|
||||
|
|
|
@ -14,7 +14,7 @@
|
|||
namespace MNN {
|
||||
|
||||
MetalLayerNorm::MetalLayerNorm(Backend *backend, const LayerNorm *layernorm)
|
||||
: Execution(backend), mGroup(layernorm->group()),
|
||||
: MetalExecution(backend), mGroup(layernorm->group()),
|
||||
mEps(layernorm->epsilon()) {
|
||||
auto context = (__bridge MNNMetalContext *)static_cast<MetalBackend *>(backend)->context();
|
||||
|
||||
|
@ -69,10 +69,10 @@ ErrorCode MetalLayerNorm::onResize(const std::vector<Tensor *> &inputs, const st
|
|||
}
|
||||
std::sort(mAxis.begin(), mAxis.end());
|
||||
|
||||
for (int i = 0; i < rank - axis.size(); ++i) {
|
||||
for (int i = 0; i < rank - (int)axis.size(); ++i) {
|
||||
mOutside *= input->length(i);
|
||||
}
|
||||
for (int i = rank - axis.size(); i < rank; ++i) {
|
||||
for (int i = rank - (int)axis.size(); i < rank; ++i) {
|
||||
mInside *= input->length(i);
|
||||
}
|
||||
|
||||
|
@ -84,25 +84,17 @@ ErrorCode MetalLayerNorm::onResize(const std::vector<Tensor *> &inputs, const st
|
|||
|
||||
|
||||
bool parallel = (mInside > 32) && ((mInside & 3) == 0);
|
||||
mPipeline = [context pipelineWithName:parallel ? @"layernorm_x4" : @"layernorm_x1"];
|
||||
mPipeline = [context pipelineWithName:parallel ? @"layernorm_x4" : @"layernorm_x1" fp16:backend->useFp16InsteadFp32()];
|
||||
|
||||
auto inside = parallel ? mInside/4 : mInside;
|
||||
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake((NSUInteger)inside, (NSUInteger)mOutside, 1)];
|
||||
return NO_ERROR;
|
||||
}
|
||||
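// Editorial sketch (not part of the diff): the layernorm pipeline choice above in one place,
// assuming layernorm_x4 lets each thread handle four contiguous values of the reduced axis:
//   bool parallel = (mInside > 32) && ((mInside % 4) == 0);
//   kernel  = parallel ? "layernorm_x4" : "layernorm_x1";
//   threads = MTLSizeMake(parallel ? mInside / 4 : mInside, mOutside, 1);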
|
||||
ErrorCode MetalLayerNorm::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
void MetalLayerNorm::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
|
||||
if(backend->isCommandEncoderSet()) {
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
auto func = [=](){
|
||||
auto input = inputs[0], output = outputs[0];
|
||||
|
||||
auto encoder = backend->encoder();
|
||||
[encoder setComputePipelineState:mPipeline];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer() offset:TensorUtils::getDescribe(input)->extra.offset atIndex:0];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId())->getBuffer() offset:TensorUtils::getDescribe(output)->extra.offset atIndex:1];
|
||||
|
@ -112,16 +104,6 @@ ErrorCode MetalLayerNorm::onExecute(const std::vector<Tensor *> &inputs, const s
|
|||
|
||||
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
|
||||
MNN_PRINT_ENCODER(context, encoder);
|
||||
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
if(backend->isCmdBufferCommit()) {
|
||||
backend->flushEncoder();
|
||||
[context commit_net];
|
||||
}
|
||||
};
|
||||
func();
|
||||
backend->addOpEncoder(func);
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
class MetalLayerNormCreator : public MetalBackend::Creator {
|
||||
|
|
|
@ -9,18 +9,18 @@
|
|||
#ifndef MetalMatMul_hpp
|
||||
#define MetalMatMul_hpp
|
||||
|
||||
#import "core/Execution.hpp"
|
||||
#import "MetalExecution.hpp"
|
||||
#import "MNN_generated.h"
|
||||
#import "MetalBackend.hpp"
|
||||
|
||||
#if MNN_METAL_ENABLED
|
||||
namespace MNN {
|
||||
|
||||
class MetalMatMul : public Execution {
|
||||
class MetalMatMul : public MetalExecution {
|
||||
public:
|
||||
MetalMatMul(Backend *backend, const MatMul *matmul);
|
||||
virtual ~MetalMatMul() = default;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual void onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) override;
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
private:
|
||||
|
|
|
@ -18,7 +18,7 @@ struct matP {
|
|||
int size[4];
|
||||
int stride[4];
|
||||
};
|
||||
MetalMatMul::MetalMatMul(Backend *backend, const MatMul *matmul) : Execution(backend) {
|
||||
MetalMatMul::MetalMatMul(Backend *backend, const MatMul *matmul) : MetalExecution(backend) {
|
||||
mTransposeA = matmul->transposeA();
|
||||
mTransposeB = matmul->transposeB();
|
||||
auto mkbn = static_cast<MetalBackend *>(backend);
|
||||
|
@ -57,23 +57,16 @@ ErrorCode MetalMatMul::onResize(const std::vector<Tensor *> &inputs, const std::
|
|||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode MetalMatMul::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
void MetalMatMul::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto context = (__bridge MNNMetalContext *)static_cast<MetalBackend *>(backend)->context();
|
||||
|
||||
if(backend->isCommandEncoderSet()) {
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
auto func = [=](){
|
||||
auto input0 = inputs[0], input1 = inputs[1], output = outputs[0];
|
||||
Tensor* C = outputs[0];
|
||||
auto e = C->length(0);
|
||||
auto h = C->length(1);
|
||||
|
||||
auto encoder = backend->encoder();
|
||||
if (inputs.size() > 2) {
|
||||
auto bandwidth = [context load:@"matmul_bias" encoder:encoder];
|
||||
auto bandwidth = [context load:@"matmul_bias" encoder:encoder fp16:backend->useFp16InsteadFp32()];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input0->deviceId())->getBuffer() offset:TensorUtils::getDescribe(input0)->extra.offset atIndex:0];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input1->deviceId())->getBuffer() offset:TensorUtils::getDescribe(input1)->extra.offset atIndex:1];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)inputs[2]->deviceId())->getBuffer() offset:TensorUtils::getDescribe(inputs[2])->extra.offset atIndex:2];
|
||||
|
@ -83,7 +76,7 @@ ErrorCode MetalMatMul::onExecute(const std::vector<Tensor *> &inputs, const std:
|
|||
threads:{ (NSUInteger)h, (NSUInteger)e, (NSUInteger)1 }
|
||||
bandwidth:bandwidth];
|
||||
} else {
|
||||
auto bandwidth = [context load:@"matmul" encoder:encoder];
|
||||
auto bandwidth = [context load:@"matmul" encoder:encoder fp16:backend->useFp16InsteadFp32()];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input0->deviceId())->getBuffer() offset:TensorUtils::getDescribe(input0)->extra.offset atIndex:0];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input1->deviceId())->getBuffer() offset:TensorUtils::getDescribe(input1)->extra.offset atIndex:1];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId())->getBuffer() offset:TensorUtils::getDescribe(output)->extra.offset atIndex:2];
|
||||
|
@ -92,16 +85,6 @@ ErrorCode MetalMatMul::onExecute(const std::vector<Tensor *> &inputs, const std:
|
|||
threads:{ (NSUInteger)h, (NSUInteger)e, (NSUInteger)1 }
|
||||
bandwidth:bandwidth];
|
||||
}
|
||||
|
||||
if(backend->isCmdBufferCommit()) {
|
||||
backend->flushEncoder();
|
||||
[context commit_net];
|
||||
}
|
||||
};
|
||||
func();
|
||||
backend->addOpEncoder(func);
|
||||
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
class MetalMatMulCreator : public MetalBackend::Creator {
|
||||
|
|
|
@ -9,18 +9,16 @@
|
|||
#ifndef MetalPReLU_hpp
|
||||
#define MetalPReLU_hpp
|
||||
|
||||
#import "core/Execution.hpp"
|
||||
#import "MetalDefine.h"
|
||||
|
||||
#import "MetalExecution.hpp"
|
||||
#if MNN_METAL_ENABLED
|
||||
namespace MNN {
|
||||
|
||||
class MetalPReLU : public Execution {
|
||||
class MetalPReLU : public MetalExecution {
|
||||
public:
|
||||
MetalPReLU(Backend *backend, const float *slope, int count);
|
||||
virtual ~MetalPReLU() = default;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual void onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) override;
|
||||
|
||||
private:
|
||||
id<MTLBuffer> mSlope;
|
||||
|
|
|
@ -14,14 +14,15 @@
|
|||
#if MNN_METAL_ENABLED
|
||||
namespace MNN {
|
||||
|
||||
MetalPReLU::MetalPReLU(Backend *backend, const float *slope, int count) : Execution(backend) {
|
||||
MetalPReLU::MetalPReLU(Backend *backend, const float *slope, int count) : MetalExecution(backend) {
|
||||
auto context = (__bridge MNNMetalContext *)static_cast<MetalBackend *>(backend)->context();
|
||||
mSlope = [context newDeviceBuffer:UP_DIV(count, 4) * 4 * sizeof(float) bytes:slope access:CPUWriteOnly];
|
||||
mShareChannel = 1 == count;
|
||||
if (!mShareChannel) {
|
||||
mShape = [context newDeviceBuffer:3 * sizeof(int) access:CPUWriteOnly];
|
||||
}
|
||||
mPipeline = [context pipelineWithName:mShareChannel ? @"prelu" : @"prelu_slopes"];
|
||||
auto mtbn = static_cast<MetalBackend *>(backend);
|
||||
mPipeline = [context pipelineWithName:mShareChannel ? @"prelu" : @"prelu_slopes" fp16:mtbn->useFp16InsteadFp32()];
|
||||
}
|
||||
|
||||
ErrorCode MetalPReLU::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
|
@ -40,17 +41,8 @@ ErrorCode MetalPReLU::onResize(const std::vector<Tensor *> &inputs, const std::v
|
|||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode MetalPReLU::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
|
||||
if(backend->isCommandEncoderSet()) {
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
auto func = [=](){
|
||||
void MetalPReLU::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {
|
||||
auto input = inputs[0], output = outputs[0];
|
||||
|
||||
auto encoder = backend->encoder();
|
||||
[encoder setComputePipelineState:mPipeline];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer() offset:TensorUtils::getDescribe(input)->extra.offset atIndex:0];
|
||||
[encoder setBuffer:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId())->getBuffer() offset:TensorUtils::getDescribe(output)->extra.offset atIndex:1];
|
||||
|
@ -59,16 +51,6 @@ ErrorCode MetalPReLU::onExecute(const std::vector<Tensor *> &inputs, const std::
|
|||
[encoder setBuffer:mShape offset:0 atIndex:3];
|
||||
}
|
||||
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
|
||||
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
if(backend->isCmdBufferCommit()) {
|
||||
backend->flushEncoder();
|
||||
[context commit_net];
|
||||
}
|
||||
};
|
||||
func();
|
||||
backend->addOpEncoder(func);
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
class MetalPReLUCreator : public MetalBackend::Creator {
|
||||
|
|
|
@ -9,19 +9,18 @@
|
|||
#ifndef MetalPooling_hpp
|
||||
#define MetalPooling_hpp
|
||||
|
||||
#import "core/Execution.hpp"
|
||||
#import "MetalExecution.hpp"
|
||||
#import "MNN_generated.h"
|
||||
#import "MetalDefine.h"
|
||||
|
||||
#if MNN_METAL_ENABLED
|
||||
namespace MNN {
|
||||
|
||||
class MetalPooling : public Execution {
|
||||
class MetalPooling : public MetalExecution {
|
||||
public:
|
||||
MetalPooling(Backend *backend, const Pool *pooling);
|
||||
virtual ~MetalPooling() = default;
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual void onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) override;
|
||||
|
||||
private:
|
||||
bool mGlobal;
|
||||
|
|