Merge pull request #1258 from alibaba/feature/sync

Feature/sync
Commit 135f3d0f84, authored by jxt1234 on 2020-11-25 20:10:15 +08:00, committed by GitHub.
111 changed files with 2283 additions and 1230 deletions

View File

@ -1,6 +1,6 @@
Pod::Spec.new do |s|
s.name = "MNN"
s.version = "0.1.1"
s.version = "1.1.0"
s.summary = "MNN"
s.description = <<-DESC
@ -46,6 +46,7 @@ Pod::Spec.new do |s|
'schema/current/*.{h}',\
'3rd_party/flatbuffers/include/flatbuffers/*.{h}',\
'source/core/**/*.{h,c,m,mm,cc,hpp,cpp}',\
'source/utils/**/*.{h,c,m,mm,cc,hpp,cpp}',\
'source/geometry/**/*.{h,c,m,mm,cc,hpp,cpp}',\
'source/cv/**/*.{h,c,m,mm,cc,hpp,cpp}',\
'source/math/**/*.{h,c,m,mm,cc,hpp,cpp,metal}',\
@ -57,6 +58,6 @@ Pod::Spec.new do |s|
'express/**/*.{hpp,cpp}'
s.header_mappings_dir = 'include'
s.pod_target_xcconfig = {'METAL_LIBRARY_FILE_BASE' => 'mnn', 'HEADER_SEARCH_PATHS' => '"$(PODS_TARGET_SRCROOT)/include" "$(PODS_TARGET_SRCROOT)/3rd_party/flatbuffers/include" "$(PODS_TARGET_SRCROOT)/source" "$(PODS_TARGET_SRCROOT)/3rd_party/half"', 'GCC_PREPROCESSOR_DEFINITIONS' => '$(inherited) MNN_CODEGEN_REGISTER=1 MNN_SUPPORT_TFLITE_QUAN=1'}
s.pod_target_xcconfig = {'METAL_LIBRARY_FILE_BASE' => 'mnn', 'HEADER_SEARCH_PATHS' => '"$(PODS_TARGET_SRCROOT)/include" "$(PODS_TARGET_SRCROOT)/3rd_party/flatbuffers/include" "$(PODS_TARGET_SRCROOT)/source" "$(PODS_TARGET_SRCROOT)/3rd_party/half"', 'GCC_PREPROCESSOR_DEFINITIONS' => '$(inherited) MNN_CODEGEN_REGISTER=1 MNN_SUPPORT_TFLITE_QUAN=1 MNN_METAL_ENABLED=1'}
s.user_target_xcconfig = { 'OTHER_LDFLAGS' => '-force_load $(BUILD_DIR)/$(CONFIGURATION)$(EFFECTIVE_PLATFORM_NAME)/MNN/libMNN.a', 'HEADER_SEARCH_PATHS' => '"$(PODS_TARGET_SRCROOT)/include"' }
end

View File

@ -77,22 +77,45 @@ public:
MNN_ASSERT(expr->get() != nullptr);
MNN_ASSERT(expr->get()->type() == OpType_BatchNorm);
auto bnPa = expr->get()->main_as_BatchNorm();
auto& inputs = expr->inputs();
int dims = 4;
if (!inputs.empty()) {
auto info = inputs[0]->getInfo();
if (nullptr != info) {
dims = info->dim.size();
}
}
mEps = bnPa->epsilon();
mMomentum = m;
mChannels = bnPa->channels();
std::vector<int> statShape;
std::vector<int> reductionDims;
int channels = mChannels;
if (dims == 2) {
statShape = {1, channels};
mReductionDims = {0};
}
if (dims == 3) {
statShape = {1, channels, 1};
mReductionDims = {0, 2};
}
if (dims == 4) {
statShape = {1, channels, 1, 1};
mReductionDims = {0, 2, 3};
}
MNN_ASSERT(bnPa->biasData()->size() == mChannels);
mBias = _TrainableParam(bnPa->biasData()->data(), {1, mChannels, 1, 1}, NCHW);
mBias = _TrainableParam(bnPa->biasData()->data(), statShape, NCHW);
MNN_ASSERT(bnPa->slopeData()->size() == mChannels);
mScale = _TrainableParam(bnPa->slopeData()->data(), {1, mChannels, 1, 1}, NCHW);
mScale = _TrainableParam(bnPa->slopeData()->data(), statShape, NCHW);
MNN_ASSERT(bnPa->meanData()->size() == mChannels);
mRunningMean = _Const(bnPa->meanData()->data(), {1, mChannels, 1, 1}, NCHW);
mRunningMean = _Const(bnPa->meanData()->data(), statShape, NCHW);
MNN_ASSERT(bnPa->meanData()->size() == mChannels);
mRunningVariance = _Const(bnPa->varData()->data(), {1, mChannels, 1, 1}, NCHW);
mRunningVariance = _Const(bnPa->varData()->data(), statShape, NCHW);
addParameter(mScale);
addParameter(mBias);
mRunningVariancePos = addParameter(mRunningVariance);
mRunningMeanPos = addParameter(mRunningMean);
mReductionDims = {0, 2, 3};
setType("BatchNorm");
}
BatchNormModule(const int channels, const int dims = 4, const float m = 0.99, const float e = 1e-5) {
@ -100,14 +123,16 @@ public:
mEps = e;
mChannels = channels;
MNN_ASSERT((dims == 2) || (dims == 4));
std::vector<int> statShape;
std::vector<int> reductionDims;
if (dims == 2) {
statShape = {1, channels};
mReductionDims = {0};
}
if (dims == 3) {
statShape = {1, channels, 1};
mReductionDims = {0, 2};
}
if (dims == 4) {
statShape = {1, channels, 1, 1};
mReductionDims = {0, 2, 3};
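
Note: the change above replaces the hard-coded {1, mChannels, 1, 1} statistic shape with one derived from the input rank. A minimal standalone sketch of that mapping, mirroring the branches in the diff (names are illustrative, not the exact MNN code):

#include <vector>
// Sketch: pick the BatchNorm statistics shape and reduction axes from the input rank.
// dims == 2 -> NC, dims == 3 -> NCL, dims == 4 (default here) -> NCHW.
static void chooseBnShapes(int dims, int channels,
                           std::vector<int>& statShape,
                           std::vector<int>& reductionDims) {
    if (dims == 2) {
        statShape     = {1, channels};
        reductionDims = {0};
    } else if (dims == 3) {
        statShape     = {1, channels, 1};
        reductionDims = {0, 2};
    } else { // dims == 4
        statShape     = {1, channels, 1, 1};
        reductionDims = {0, 2, 3};
    }
}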

package_scripts/linux/build.sh (new executable file, 31 lines)
View File

@ -0,0 +1,31 @@
## scp package_scripts/linux/build.sh mnnteam@30.6.159.68:/mnt/partition4/CI/scripts
# scp ~/.ssh/id_rsa* mnnteam@30.6.159.68:/mnt/partition4/CI
# ssh mnnteam@30.6.159.68
# docker run --name CI_tmp --rm -it -v /mnt/partition4/CI:/mnt reg.docker.alibaba-inc.com/shuhui/manylinux_2014 bash /mnt/scripts/build.sh -r git@gitlab.alibaba-inc.com:AliNN/AliNNPrivate.git
# docker run --name CI_tmp --rm -it -v /mnt/partition4/CI:/mnt reg.docker.alibaba-inc.com/shuhui/manylinux_2014 bash /mnt/scripts/build.sh -r git@gitlab.alibaba-inc.com:AliNN/MNN.git
# docker run --name CI_tmp --rm -it -v /mnt/partition4/CI:/mnt reg.docker.alibaba-inc.com/shuhui/manylinux_2014 bash /mnt/scripts/build.sh -r git@github.com:alibaba/MNN.git
set -e
usage() {
echo "Usage: $0 -r code_repo"
echo -e "\t-r code repository"
exit 1
}
while getopts 'r:' opt; do
case "$opt" in
r ) CODE_REPO=$OPTARG ;;
h|? ) usage ;;
esac
done
yes | cp /mnt/id_rsa* ~/.ssh 2>/dev/null
cd /root
git clone $CODE_REPO MNN && cd MNN
mkdir MNN
cp -r include/* MNN
./package_scripts/linux/build_lib.sh -o MNN-CPU/lib
# ./package_scripts/linux/build_tools.sh -o MNN-CPU/tools
./package_scripts/linux/build_whl.sh -o MNN-CPU/py_whl
# ./package_scripts/linux/build_bridge.sh -o MNN-CPU/py_bridge

View File

View File

@ -0,0 +1,66 @@
# ./package_scripts/linux/build_lib.sh -o MNN-CPU/lib
# ./package_scripts/linux/build_lib.sh -o MNN-CPU-OPENCL/lib -b
# MNN
# |--- Debug
# | |--- libMNN.a
# | |--- libMNN.so
# |
# |--- Release
# |--- libMNN.a
# |--- libMNN.so
set -e
usage() {
echo "Usage: $0 -o path [-b]"
echo -e "\t-o package files output directory"
echo -e "\t-b opencl backend"
exit 1
}
while getopts "o:hb" opt; do
case "$opt" in
o ) path=$OPTARG ;;
b ) opencl=true ;;
h|? ) usage ;;
esac
done
# clear and create package directory
./schema/generate.sh
rm -rf $path && mkdir -p $path
mkdir -p $path/Debug
mkdir -p $path/Release
PACKAGE_PATH=$(realpath $path)
CMAKE_ARGS="-DMNN_SEP_BUILD=OFF"
if [ ! -z $opencl ]; then
CMAKE_ARGS="$CMAKE_ARGS -DMNN_OPENCL=ON"
fi
rm -rf build && mkdir build
pushd build
# Debug Dynamic MNN.framework
[ -f CMakeCache.txt ] && rm CMakeCache.txt
cmake $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_BUILD_SHARED_LIBS=ON .. && make -j24
cp libMNN.so $PACKAGE_PATH/Debug
# Debug Static MNN.framework
[ -f CMakeCache.txt ] && rm CMakeCache.txt
cmake $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_BUILD_SHARED_LIBS=OFF .. && make -j24
cp libMNN.a $PACKAGE_PATH/Debug
# Release Dynamic MNN.framework
[ -f CMakeCache.txt ] && rm CMakeCache.txt
cmake $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=ON .. && make -j24
cp libMNN.so $PACKAGE_PATH/Release
# Release Static MNN.framework
[ -f CMakeCache.txt ] && rm CMakeCache.txt
cmake $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF .. && make -j24
cp libMNN.a $PACKAGE_PATH/Release
popd

View File

@ -0,0 +1,38 @@
# ./package_scripts/linux/build_tools.sh -o MNN-CPU/tools
# ./package_scripts/linux/build_tools.sh -o MNN-CPU-OPENCL/tools -b
set -e
usage() {
echo "Usage: $0 -o path [-b]"
echo -e "\t-o package files output directory"
echo -e "\t-b opencl backend"
exit 1
}
while getopts "o:hb" opt; do
case "$opt" in
o ) path=$OPTARG ;;
b ) opencl=true ;;
h|? ) usage ;;
esac
done
# clear and create package directory
./schema/generate.sh
rm -rf $path && mkdir -p $path
TOOLS_PATH=$(realpath $path)
CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Release -DMNN_SEP_BUILD=OFF -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_TRAIN=ON -DMNN_PORTABLE_BUILD=ON -DMNN_BUILD_TOOLS=ON -DMNN_BUILD_QUANTOOLS=ON -DMNN_BUILD_BENCHMARK=ON -DMNN_BUILD_TEST=ON"
if [ ! -z $opencl ]; then
CMAKE_ARGS="$CMAKE_ARGS -DMNN_OPENCL=ON"
fi
rm -rf build && mkdir build
pushd build
[ -f CMakeCache.txt ] && rm CMakeCache.txt
cmake $CMAKE_ARGS .. && make -j8
cp *.out $TOOLS_PATH
popd

View File

@ -0,0 +1,50 @@
# ./package_scripts/linux/build_whl.sh -o MNN-CPU/py_whl
# ./package_scripts/linux/build_whl.sh -o MNN-CPU-OPENCL/py_whl -b
set -e
usage() {
echo "Usage: $0 -o path [-b]"
echo -e "\t-o package files output directory"
echo -e "\t-b opencl backend"
exit 1
}
while getopts "o:v:hb" opt; do
case "$opt" in
o ) path=$OPTARG ;;
b ) opencl=true ;;
h|? ) usage ;;
esac
done
./schema/generate.sh
rm -rf $path && mkdir -p $path
PACKAGE_PATH=$(realpath $path)
CMAKE_ARGS="-DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_SEP_BUILD=OFF -DMNN_USE_THREAD_POOL=OFF -DMNN_OPENMP=ON"
if [ ! -z $opencl ]; then
CMAKE_ARGS="$CMAKE_ARGS -DMNN_OPENCL=ON"
fi
rm -rf pymnn_build && mkdir pymnn_build
pushd pymnn_build
cmake $CMAKE_ARGS .. && make MNN MNNTrain MNNConvert -j24
popd
pushd pymnn/pip_package
rm -rf dist && mkdir dist
rm -rf wheelhouse && mkdir wheelhouse
#Compile wheels
for PYBIN in /opt/python/*/bin; do
"${PYBIN}/pip" install -U numpy
"${PYBIN}/python" setup.py bdist_wheel
done
# Bundle external shared libraries into the wheels
for whl in dist/*.whl; do
auditwheel repair "$whl" --plat manylinux2014_x86_64 -w wheelhouse
done
cp wheelhouse/* $PACKAGE_PATH
popd

View File

@ -1,41 +0,0 @@
# MNN_Linux
# |------- MNN_Linux_lib
# |---------- Dynamic_Library
# |---------- Static_Library
# |------- MNN_Linux_tools
LINUX_PACKAGE_NAME="MNN_Linux"
# clear and create package directory
./schema/generate.sh
LINUX_PACKAGE_PATH="$(pwd)/$LINUX_PACKAGE_NAME"
rm -rf $LINUX_PACKAGE_PATH
mkdir $LINUX_PACKAGE_PATH && cd $LINUX_PACKAGE_PATH
mkdir MNN_Linux_lib && cd MNN_Linux_lib
mkdir Dynamic_Library
mkdir Static_Library
cd ..
mkdir MNN_Linux_tools
cd ..
rm -rf build
mkdir build && cd build
# tools without dependency, static library without sep_build
cmake -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_SEP_BUILD=OFF -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_DEMO=ON -DMNN_BUILD_QUANTOOLS=ON -DMNN_EVALUATION=ON .. && make -j$(nproc)
pushd ${LINUX_PACKAGE_PATH}
cp ../build/*.out MNN_Linux_tools
cp ../build/MNNConvert MNN_Linux_tools
cp ../build/MNNDump2Json MNN_Linux_tools
cp ../build/OnnxClip MNN_Linux_tools
cp ../build/libMNN.a MNN_Linux_lib/Static_Library
popd
# dynamic library without sep_build
rm CMakeCache.txt
cmake -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release .. && make -j$(nproc)
cd $LINUX_PACKAGE_PATH
cp ../build/libMNN.so MNN_Linux_lib/Dynamic_Library
# auto zip MNN_Linux_lib MNN_Linux_tools
zip -r MNN_Linux_lib.zip MNN_Linux_lib
zip -r MNN_Linux_tools.zip MNN_Linux_tools

View File

View File

@ -0,0 +1,71 @@
# ./package_scripts/mac/build_lib.sh -o MNN-CPU/lib
# ./package_scripts/mac/build_lib.sh -o MNN-CPU-OPENCL/lib -b
# MNN
# |--- Debug
# | |--- Dynamic
# | |--- Static
# |
# |--- Release
# |--- Dynamic
# |--- Static
# Only have MNN.framework
set -e
usage() {
echo "Usage: $0 -o path [-b]"
echo -e "\t-o package files output directory"
echo -e "\t-b opencl backend"
exit 1
}
while getopts "o:hb" opt; do
case "$opt" in
o ) path=$OPTARG ;;
b ) opencl=true ;;
h|? ) usage ;;
esac
done
# clear and create package directory
./schema/generate.sh
rm -rf $path && mkdir -p $path
pushd $path
mkdir -p Debug/Dynamic
mkdir -p Debug/Static
mkdir -p Release/Dynamic
mkdir -p Release/Static
popd
PACKAGE_PATH=$(realpath $path)
CMAKE_ARGS="-DMNN_SEP_BUILD=OFF -DMNN_AAPL_FMWK=ON"
if [ ! -z $opencl ]; then
CMAKE_ARGS="$CMAKE_ARGS -DMNN_OPENCL=ON"
fi
rm -rf build && mkdir build
pushd build
# Debug Dynamic MNN.framework
[ -f CMakeCache.txt ] && rm CMakeCache.txt
cmake $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_BUILD_SHARED_LIBS=ON .. && make -j8
cp -R MNN.framework $PACKAGE_PATH/Debug/Dynamic
# Debug Static MNN.framework
[ -f CMakeCache.txt ] && rm CMakeCache.txt
cmake $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_BUILD_SHARED_LIBS=OFF .. && make -j8
cp -R MNN.framework $PACKAGE_PATH/Debug/Static
# Release Dynamic MNN.framework
[ -f CMakeCache.txt ] && rm CMakeCache.txt
cmake $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=ON .. && make -j8
cp -R MNN.framework $PACKAGE_PATH/Release/Dynamic
# Release Static MNN.framework
[ -f CMakeCache.txt ] && rm CMakeCache.txt
cmake $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF .. && make -j8
cp -R MNN.framework $PACKAGE_PATH/Release/Static
popd

View File

@ -0,0 +1,38 @@
# ./package_scripts/mac/build_tools.sh -o MNN-CPU/tools
# ./package_scripts/mac/build_tools.sh -o MNN-CPU-OPENCL/tools -b
set -e
usage() {
echo "Usage: $0 -o path [-b]"
echo -e "\t-o package files output directory"
echo -e "\t-b opencl backend"
exit 1
}
while getopts "o:hb" opt; do
case "$opt" in
o ) path=$OPTARG ;;
b ) opencl=true ;;
h|? ) usage ;;
esac
done
# clear and create package directory
./schema/generate.sh
rm -rf $path && mkdir -p $path
TOOLS_PATH=$(realpath $path)
CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Release -DMNN_SEP_BUILD=OFF -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_TRAIN=ON -DMNN_PORTABLE_BUILD=ON -DMNN_BUILD_TOOLS=ON -DMNN_BUILD_QUANTOOLS=ON -DMNN_BUILD_BENCHMARK=ON -DMNN_BUILD_TEST=ON"
if [ ! -z $opencl ]; then
CMAKE_ARGS="$CMAKE_ARGS -DMNN_OPENCL=ON"
fi
rm -rf build && mkdir build
pushd build
[ -f CMakeCache.txt ] && rm CMakeCache.txt
cmake $CMAKE_ARGS .. && make -j8
cp *.out $TOOLS_PATH
popd

View File

@ -0,0 +1,45 @@
# ./package_scripts/mac/build_whl.sh -o MNN-CPU/py_whl -v 2.7.17,3.5.7,3.6.9,3.7.4,3.8.0
# ./package_scripts/mac/build_whl.sh -o MNN-CPU-OPENCL/py_whl -v 2.7.17,3.5.7,3.6.9,3.7.4,3.8.0 -b
set -e
usage() {
echo "Usage: $0 -o path -v python_versions [-b]"
echo -e "\t-o package files output directory"
echo -e "\t-v python versions in pyenv"
echo -e "\t-b opencl backend"
exit 1
}
while getopts "o:v:hb" opt; do
case "$opt" in
o ) path=$OPTARG ;;
v ) IFS="," read -a python_versions <<< $OPTARG ;;
b ) opencl=true ;;
h|? ) usage ;;
esac
done
./schema/generate.sh
rm -rf $path && mkdir -p $path
PACKAGE_PATH=$(realpath $path)
CMAKE_ARGS="-DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF -DMNN_EXPR_SHAPE_EAGER=ON -DMNN_TRAIN_DEBUG=ON"
if [ ! -z $opencl ]; then
CMAKE_ARGS="$CMAKE_ARGS -DMNN_OPENCL=ON"
fi
rm -rf pymnn_build && mkdir pymnn_build
pushd pymnn_build
cmake $CMAKE_ARGS .. && make MNN MNNTrain MNNConvert -j8
popd
pushd pymnn/pip_package
rm -rf dist && mkdir dist
for env in $python_versions; do
pyenv global $env
python build_wheel.py
done
cp dist/* $PACKAGE_PATH
popd

View File

@ -1,49 +0,0 @@
# MNN_Mac
# |------- MNN_Mac_lib
# |---------- Dynamic_Library
# |---------- Static_Library
# |---------- MNN.framework
# |------- MNN_Mac_tools
MAC_PACKAGE_NAME="MNN_Mac"
# clear and create package directory
./schema/generate.sh
MAC_PACKAGE_PATH=$(pwd)/$MAC_PACKAGE_NAME
rm -rf $MAC_PACKAGE_PATH
mkdir $MAC_PACKAGE_PATH && cd $MAC_PACKAGE_PATH
mkdir MNN_Mac_lib && cd MNN_Mac_lib
mkdir Dynamic_Library
mkdir Static_Library
cd ..
mkdir MNN_Mac_tools
cd ..
rm -rf build
mkdir build && cd build
# tools without dependency, static library without sep_build
cmake -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_SEP_BUILD=OFF -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_DEMO=ON -DMNN_BUILD_QUANTOOLS=ON -DMNN_EVALUATION=ON .. && make -j8
pushd ${MAC_PACKAGE_PATH}
cp ../build/*.out MNN_Mac_tools
cp ../build/MNNConvert MNN_Mac_tools
cp ../build/MNNDump2Json MNN_Mac_tools
cp ../build/OnnxClip MNN_Mac_tools
cp ../build/libMNN.a MNN_Mac_lib/Static_Library
popd
# dynamic library without sep_build
rm CMakeCache.txt
cmake -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release .. && make -j8
cd ..
cp build/libMNN.dylib ${MAC_PACKAGE_PATH}/MNN_Mac_lib/Dynamic_Library
# mac framework without sep_build
cd build
rm CMakeCache.txt
cmake -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_AAPL_FMWK=ON .. && make -j8
cd $MAC_PACKAGE_PATH
cp -r ../build/MNN.framework MNN_Mac_lib
# auto zip MNN_Mac_lib MNN_Mac_tools
zip -r MNN_Mac_lib.zip MNN_Mac_lib
zip -r MNN_Mac_tools.zip MNN_Mac_tools

View File

@ -0,0 +1,89 @@
# .\package_scripts\win\build_lib.ps1 -path MNN-CPU/lib/x64
# .\package_scripts\win\build_lib.ps1 -path MNN-CPU/lib/x86
# .\package_scripts\win\build_lib.ps1 -path MNN-CPU-OPENCL/lib/x64 -opencl
# .\package_scripts\win\build_lib.ps1 -path MNN-CPU-OPENCL/lib/x86 -opencl
# MNN
# |-- Debug
# | |--- MD
# | |--- MT
# | |--- Static
# |
# |-- Release
# |--- MD
# |--- MT
# |--- Static
Param(
[Parameter(Mandatory=$true)][String]$path,
[Switch]$opencl
)
$erroractionpreference = "stop"
Remove-Item $path -Recurse -ErrorAction Ignore
mkdir -p $path
$PACKAGE_PATH = $(Resolve-Path $path).Path
#clear and create package directory
powershell ./schema/generate.ps1
pushd $PACKAGE_PATH
mkdir -p Debug\MD
mkdir -p Debug\MT
mkdir -p Debug\Static
mkdir -p Release\MD
mkdir -p Release\MT
mkdir -p Release\Static
popd
$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF"
if ($opencl) {
$CMAKE_ARGS = "$CMAKE_ARGS -DMNN_OPENCL=ON"
}
Remove-Item build -Recurse -ErrorAction Ignore
mkdir build
pushd build
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON .."
ninja
cp MNN.lib $PACKAGE_PATH\Debug\MT
cp MNN.dll $PACKAGE_PATH\Debug\MT
cp MNN.pdb $PACKAGE_PATH\Debug\MT
rm MNN.*
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF .."
ninja
cp MNN.lib $PACKAGE_PATH\Debug\MD
cp MNN.dll $PACKAGE_PATH\Debug\MD
cp MNN.pdb $PACKAGE_PATH\Debug\MD
rm MNN.*
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF .."
ninja
cp MNN.lib $PACKAGE_PATH\Debug\Static
rm MNN.*
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON .."
ninja
cp MNN.lib $PACKAGE_PATH\Release\MT
cp MNN.dll $PACKAGE_PATH\Release\MT
cp MNN.pdb $PACKAGE_PATH\Release\MT
rm MNN.*
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF .."
ninja
cp MNN.lib $PACKAGE_PATH\Release\MD
cp MNN.dll $PACKAGE_PATH\Release\MD
cp MNN.pdb $PACKAGE_PATH\Release\MD
rm MNN.*
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF .."
ninja
cp MNN.lib $PACKAGE_PATH\Release\Static
popd

View File

@ -0,0 +1,39 @@
# .\package_scripts\win\build_tools.ps1 -path MNN-CPU/tools/x64
# .\package_scripts\win\build_tools.ps1 -path MNN-CPU/tools/x86
# .\package_scripts\win\build_tools.ps1 -path MNN-CPU-OPENCL/tools/x64 -opencl
# .\package_scripts\win\build_tools.ps1 -path MNN-CPU-OPENCL/tools/x86 -opencl
Param(
[Parameter(Mandatory=$true)][String]$path,
[Switch]$opencl
)
$erroractionpreference = "stop"
Remove-Item $path -Recurse -ErrorAction Ignore
mkdir -p $path
$TOOLS_PATH = $(Resolve-Path $path).Path
powershell ./schema/generate.ps1
$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_TOOLS=ON -DMNN_BUILD_QUANTOOLS=ON -DMNN_EVALUATION=ON -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_BENCHMARK=ON -DMNN_BUILD_TEST=ON"
if ($opencl) {
$CMAKE_ARGS = "$CMAKE_ARGS -DMNN_OPENCL=ON"
}
Remove-Item build -Recurse -ErrorAction Ignore
mkdir build
pushd build
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_SHARED_LIBS=OFF .."
ninja
cp MNNV2Basic.out.exe $TOOLS_PATH
cp MNNConvert.exe $TOOLS_PATH
cp testModel.out.exe $TOOLS_PATH
cp run_test.out.exe $TOOLS_PATH
cp quantized.out.exe $TOOLS_PATH
cp train.out.exe $TOOLS_PATH
cp benchmark.out.exe $TOOLS_PATH
cp benchmarkExprModels.out.exe $TOOLS_PATH
cp backendTest.out.exe $TOOLS_PATH
popd

View File

@ -0,0 +1,46 @@
# .\package_scripts\win_pymm_package.ps1 -path MNN-CPU/py_whl/x64 -pyenvs "2.7.17,3.5.4,2.6.8,3.7.7,3.8.2"
# .\package_scripts\win_pymm_package.ps1 -x86 -path MNN-CPU/py_whl/x86 -pyenvs "2.7.17-win32,3.5.4-win32,2.6.8-win32,3.7.7-win32,3.8.2-win32"
# .\package_scripts\win_pymm_package.ps1 -path MNN-CPU-OPENCL/py_whl/x64 -pyenvs "2.7.17,3.5.4,2.6.8,3.7.7,3.8.2"
# .\package_scripts\win_pymm_package.ps1 -x86 -path MNN-CPU-OPENCL/py_whl/x86 -pyenvs "2.7.17-win32,3.5.4-win32,2.6.8-win32,3.7.7-win32,3.8.2-win32"
Param(
[Parameter(Mandatory=$true)][String]$pyenvs,
[Parameter(Mandatory=$true)][String]$path,
[Switch]$x86,
[Switch]$opencl
)
$erroractionpreference = "stop"
$python_versions = $pyenvs.Split(",")
Remove-Item $path -Recurse -ErrorAction Ignore
mkdir -p $path
$PACKAGE_PATH = $(Resolve-Path $path).Path
$ARGS = ""
if ($x86) {
$ARGS = "--x86"
}
powershell ./schema/generate.ps1
Remove-Item pymnn_build -Recurse -ErrorAction Ignore
mkdir pymnn_build
pushd pymnn/pip_package
Remove-Item dist -Recurse -ErrorAction Ignore
mkdir dist
pushd pymnn_build
$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON "
if ($opencl) {
$CMAKE_ARGS = "$CMAKE_ARGS -DMNN_OPENCL=ON"
}
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS .."
ninja MNN MNNTrain MNNConvert
popd
Foreach ($env in $python_versions) {
pyenv global $env
python build_wheel.py $ARGS
}
cp dist/* $PACKAGE_PATH
popd

View File

@ -1,65 +0,0 @@
# MNN
# |-- Debug
# | |--- MD
# | |--- MT
# |-- Release
# |--- MD
# |--- MT
$erroractionpreference = "stop"
Set-Variable -Name WINDOWS_PACKAGE_NAME -Value "MNN"
#clear and create package directory
powershell ./schema/generate.ps1
Set-Variable -Name WINDOWS_PACKAGE_PATH -Value "$(pwd)\$WINDOWS_PACKAGE_NAME"
Remove-Item $WINDOWS_PACKAGE_PATH -Recurse -ErrorAction Ignore
mkdir $WINDOWS_PACKAGE_PATH\
cd $WINDOWS_PACKAGE_PATH
mkdir -p Debug\MD
mkdir -p Debug\MT
mkdir -p Release\MD
mkdir -p Release\MT
cd ..
Remove-Item build -Recurse -ErrorAction Ignore
mkdir build
pushd build
# tools without dependency, static library without sep_build
#cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_BUILD_CONVERTER=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_DEMO=ON -DMNN_BUILD_QUANTOOLS=ON -DMNN_EVALUATION=ON ..
#ninja
#pushd $WINDOWS_PACKAGE_PATH
#cp ..\build\*.exe MNN_Windows_tools
#cp ..\build\*.pdb MNN_Windows_tools
#cp ..\build\MNN.lib MNN_Windows_lib\Static_Library
#popd
Remove-Item CMakeCache.txt -ErrorAction Ignore
cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON -DMNN_OPENCL=ON ..
ninja
cp MNN.lib $WINDOWS_PACKAGE_PATH\Debug\MT
cp MNN.dll $WINDOWS_PACKAGE_PATH\Debug\MT
cp MNN.pdb $WINDOWS_PACKAGE_PATH\Debug\MT
Remove-Item CMakeCache.txt -ErrorAction Ignore
cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF -DMNN_OPENCL=ON ..
ninja
cp MNN.lib $WINDOWS_PACKAGE_PATH\Debug\MD
cp MNN.dll $WINDOWS_PACKAGE_PATH\Debug\MD
cp MNN.pdb $WINDOWS_PACKAGE_PATH\Debug\MD
Remove-Item CMakeCache.txt -ErrorAction Ignore
cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON -DMNN_OPENCL=ON ..
ninja
cp MNN.lib $WINDOWS_PACKAGE_PATH\Release\MT
cp MNN.dll $WINDOWS_PACKAGE_PATH\Release\MT
cp MNN.pdb $WINDOWS_PACKAGE_PATH\Release\MT
Remove-Item CMakeCache.txt -ErrorAction Ignore
cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF -DMNN_OPENCL=ON ..
ninja
cp MNN.lib $WINDOWS_PACKAGE_PATH\Release\MD
cp MNN.dll $WINDOWS_PACKAGE_PATH\Release\MD
cp MNN.pdb $WINDOWS_PACKAGE_PATH\Release\MD
popd

View File

@ -118,7 +118,6 @@
489404DE24A2FC2C001E456C /* GeometryReverseSequence.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 489404DD24A2FC2B001E456C /* GeometryReverseSequence.cpp */; };
489D7A672550FDC800AD896A /* MetalReLU6.metal in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A162550FDC800AD896A /* MetalReLU6.metal */; };
489D7A682550FDC800AD896A /* MetalReduction.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A172550FDC800AD896A /* MetalReduction.hpp */; };
489D7A692550FDC800AD896A /* MetalDequantize.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A182550FDC800AD896A /* MetalDequantize.hpp */; };
489D7A6A2550FDC800AD896A /* MetalConvolutionGEMM.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A192550FDC800AD896A /* MetalConvolutionGEMM.hpp */; };
489D7A6B2550FDC800AD896A /* MetalReLU.metal in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A1A2550FDC800AD896A /* MetalReLU.metal */; };
489D7A6E2550FDC800AD896A /* MetalROIPooling.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A1D2550FDC800AD896A /* MetalROIPooling.hpp */; };
@ -156,9 +155,7 @@
489D7A8F2550FDC900AD896A /* MetalScale.metal in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A3E2550FDC800AD896A /* MetalScale.metal */; };
489D7A902550FDC900AD896A /* MetalConvolution.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A3F2550FDC800AD896A /* MetalConvolution.hpp */; };
489D7A912550FDC900AD896A /* MetalScale.mm in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A402550FDC800AD896A /* MetalScale.mm */; };
489D7A922550FDC900AD896A /* MetalDequantize.metal in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A412550FDC800AD896A /* MetalDequantize.metal */; };
489D7A932550FDC900AD896A /* MetalFixedPoint.metal in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A422550FDC800AD896A /* MetalFixedPoint.metal */; };
489D7A942550FDC900AD896A /* MetalDequantize.mm in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A432550FDC800AD896A /* MetalDequantize.mm */; };
489D7A952550FDC900AD896A /* MetalMatMul.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A442550FDC800AD896A /* MetalMatMul.hpp */; };
489D7A962550FDC900AD896A /* MetalConvolution1x1.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A452550FDC800AD896A /* MetalConvolution1x1.hpp */; };
489D7A972550FDC900AD896A /* MetalConvolutionDepthwise.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A462550FDC800AD896A /* MetalConvolutionDepthwise.hpp */; };
@ -299,7 +296,6 @@
92FF025523AA0B5A00AC97F6 /* CPUTanh.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF00D323AA0B4800AC97F6 /* CPUTanh.cpp */; };
92FF025723AA0B5A00AC97F6 /* CPUQuanConvolutionDepthwise.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF00D523AA0B4800AC97F6 /* CPUQuanConvolutionDepthwise.cpp */; };
92FF025923AA0B5A00AC97F6 /* CPUPoolInt8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF00D723AA0B4800AC97F6 /* CPUPoolInt8.cpp */; };
92FF025B23AA0B5A00AC97F6 /* CPUPoolGrad.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF00D923AA0B4800AC97F6 /* CPUPoolGrad.hpp */; };
92FF025C23AA0B5A00AC97F6 /* CPUGatherV2.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF00DA23AA0B4800AC97F6 /* CPUGatherV2.hpp */; };
92FF025D23AA0B5A00AC97F6 /* CPUInterp.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF00DB23AA0B4800AC97F6 /* CPUInterp.hpp */; };
92FF025E23AA0B5A00AC97F6 /* CPUROIPooling.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF00DC23AA0B4900AC97F6 /* CPUROIPooling.cpp */; };
@ -516,7 +512,6 @@
92FF037A23AA0B5A00AC97F6 /* CPUSigmoid.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01FB23AA0B5200AC97F6 /* CPUSigmoid.cpp */; };
92FF037D23AA0B5A00AC97F6 /* CPURelu.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01FE23AA0B5200AC97F6 /* CPURelu.cpp */; };
92FF037E23AA0B5A00AC97F6 /* CPUDetectionPostProcess.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01FF23AA0B5200AC97F6 /* CPUDetectionPostProcess.cpp */; };
92FF038023AA0B5A00AC97F6 /* CPUPoolGrad.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF020123AA0B5300AC97F6 /* CPUPoolGrad.cpp */; };
92FF038223AA0B5A00AC97F6 /* CPUSetDiff1D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF020323AA0B5300AC97F6 /* CPUSetDiff1D.cpp */; };
92FF038523AA0B5A00AC97F6 /* CPUMoments.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF020623AA0B5300AC97F6 /* CPUMoments.cpp */; };
92FF038623AA0B5A00AC97F6 /* CPULinSpace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF020723AA0B5300AC97F6 /* CPULinSpace.cpp */; };
@ -678,8 +673,6 @@
C43C81E02518944F00A0FF84 /* WinogradHelper.hpp in Headers */ = {isa = PBXBuildFile; fileRef = C43C81D82518944F00A0FF84 /* WinogradHelper.hpp */; };
C43C81E12518944F00A0FF84 /* ConvInt8_1xN.hpp in Headers */ = {isa = PBXBuildFile; fileRef = C43C81D92518944F00A0FF84 /* ConvInt8_1xN.hpp */; };
C43C81E22518944F00A0FF84 /* ConvInt8_1xN.cpp in Sources */ = {isa = PBXBuildFile; fileRef = C43C81DA2518944F00A0FF84 /* ConvInt8_1xN.cpp */; };
C43C81E32518944F00A0FF84 /* BlstmComputer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = C43C81DB2518944F00A0FF84 /* BlstmComputer.cpp */; };
C43C81E42518944F00A0FF84 /* BlstmComputer.hpp in Headers */ = {isa = PBXBuildFile; fileRef = C43C81DC2518944F00A0FF84 /* BlstmComputer.hpp */; };
C43C81EE2518947700A0FF84 /* MNNGemmInt8toFloat32_8x4_Common.S in Sources */ = {isa = PBXBuildFile; fileRef = C43C81EB2518947700A0FF84 /* MNNGemmInt8toFloat32_8x4_Common.S */; };
C43C81F32518948800A0FF84 /* MNNGemmInt8toFloat32_8x4_Common.S in Sources */ = {isa = PBXBuildFile; fileRef = C43C81EF2518948800A0FF84 /* MNNGemmInt8toFloat32_8x4_Common.S */; };
C43C81F42518948800A0FF84 /* MNNGemmint8to32_8x4_Common.S in Sources */ = {isa = PBXBuildFile; fileRef = C43C81F02518948800A0FF84 /* MNNGemmint8to32_8x4_Common.S */; };
@ -847,7 +840,6 @@
489404DD24A2FC2B001E456C /* GeometryReverseSequence.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryReverseSequence.cpp; sourceTree = "<group>"; };
489D7A162550FDC800AD896A /* MetalReLU6.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = MetalReLU6.metal; sourceTree = "<group>"; };
489D7A172550FDC800AD896A /* MetalReduction.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalReduction.hpp; sourceTree = "<group>"; };
489D7A182550FDC800AD896A /* MetalDequantize.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalDequantize.hpp; sourceTree = "<group>"; };
489D7A192550FDC800AD896A /* MetalConvolutionGEMM.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalConvolutionGEMM.hpp; sourceTree = "<group>"; };
489D7A1A2550FDC800AD896A /* MetalReLU.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = MetalReLU.metal; sourceTree = "<group>"; };
489D7A1D2550FDC800AD896A /* MetalROIPooling.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalROIPooling.hpp; sourceTree = "<group>"; };
@ -885,9 +877,7 @@
489D7A3E2550FDC800AD896A /* MetalScale.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = MetalScale.metal; sourceTree = "<group>"; };
489D7A3F2550FDC800AD896A /* MetalConvolution.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalConvolution.hpp; sourceTree = "<group>"; };
489D7A402550FDC800AD896A /* MetalScale.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalScale.mm; sourceTree = "<group>"; };
489D7A412550FDC800AD896A /* MetalDequantize.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = MetalDequantize.metal; sourceTree = "<group>"; };
489D7A422550FDC800AD896A /* MetalFixedPoint.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = MetalFixedPoint.metal; sourceTree = "<group>"; };
489D7A432550FDC800AD896A /* MetalDequantize.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalDequantize.mm; sourceTree = "<group>"; };
489D7A442550FDC800AD896A /* MetalMatMul.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalMatMul.hpp; sourceTree = "<group>"; };
489D7A452550FDC800AD896A /* MetalConvolution1x1.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalConvolution1x1.hpp; sourceTree = "<group>"; };
489D7A462550FDC800AD896A /* MetalConvolutionDepthwise.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalConvolutionDepthwise.hpp; sourceTree = "<group>"; };
@ -1035,7 +1025,6 @@
92FF00D323AA0B4800AC97F6 /* CPUTanh.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUTanh.cpp; sourceTree = "<group>"; };
92FF00D523AA0B4800AC97F6 /* CPUQuanConvolutionDepthwise.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUQuanConvolutionDepthwise.cpp; sourceTree = "<group>"; };
92FF00D723AA0B4800AC97F6 /* CPUPoolInt8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUPoolInt8.cpp; sourceTree = "<group>"; };
92FF00D923AA0B4800AC97F6 /* CPUPoolGrad.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUPoolGrad.hpp; sourceTree = "<group>"; };
92FF00DA23AA0B4800AC97F6 /* CPUGatherV2.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUGatherV2.hpp; sourceTree = "<group>"; };
92FF00DB23AA0B4800AC97F6 /* CPUInterp.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUInterp.hpp; sourceTree = "<group>"; };
92FF00DC23AA0B4900AC97F6 /* CPUROIPooling.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUROIPooling.cpp; sourceTree = "<group>"; };
@ -1252,7 +1241,6 @@
92FF01FB23AA0B5200AC97F6 /* CPUSigmoid.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUSigmoid.cpp; sourceTree = "<group>"; };
92FF01FE23AA0B5200AC97F6 /* CPURelu.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPURelu.cpp; sourceTree = "<group>"; };
92FF01FF23AA0B5200AC97F6 /* CPUDetectionPostProcess.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUDetectionPostProcess.cpp; sourceTree = "<group>"; };
92FF020123AA0B5300AC97F6 /* CPUPoolGrad.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUPoolGrad.cpp; sourceTree = "<group>"; };
92FF020323AA0B5300AC97F6 /* CPUSetDiff1D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUSetDiff1D.cpp; sourceTree = "<group>"; };
92FF020623AA0B5300AC97F6 /* CPUMoments.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUMoments.cpp; sourceTree = "<group>"; };
92FF020723AA0B5300AC97F6 /* CPULinSpace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPULinSpace.cpp; sourceTree = "<group>"; };
@ -1414,8 +1402,6 @@
C43C81D82518944F00A0FF84 /* WinogradHelper.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = WinogradHelper.hpp; sourceTree = "<group>"; };
C43C81D92518944F00A0FF84 /* ConvInt8_1xN.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvInt8_1xN.hpp; sourceTree = "<group>"; };
C43C81DA2518944F00A0FF84 /* ConvInt8_1xN.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvInt8_1xN.cpp; sourceTree = "<group>"; };
C43C81DB2518944F00A0FF84 /* BlstmComputer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = BlstmComputer.cpp; sourceTree = "<group>"; };
C43C81DC2518944F00A0FF84 /* BlstmComputer.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = BlstmComputer.hpp; sourceTree = "<group>"; };
C43C81EB2518947700A0FF84 /* MNNGemmInt8toFloat32_8x4_Common.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmInt8toFloat32_8x4_Common.S; sourceTree = "<group>"; };
C43C81EF2518948800A0FF84 /* MNNGemmInt8toFloat32_8x4_Common.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmInt8toFloat32_8x4_Common.S; sourceTree = "<group>"; };
C43C81F02518948800A0FF84 /* MNNGemmint8to32_8x4_Common.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmint8to32_8x4_Common.S; sourceTree = "<group>"; };
@ -1796,8 +1782,6 @@
92FF025023AA0B5900AC97F6 /* CPUOPRegister.cpp */,
92FF01F123AA0B5200AC97F6 /* CPUPool.cpp */,
92FF00F823AA0B4A00AC97F6 /* CPUPool.hpp */,
92FF020123AA0B5300AC97F6 /* CPUPoolGrad.cpp */,
92FF00D923AA0B4800AC97F6 /* CPUPoolGrad.hpp */,
92FF00D723AA0B4800AC97F6 /* CPUPoolInt8.cpp */,
92FF00F123AA0B4A00AC97F6 /* CPUPoolInt8.hpp */,
92FF010223AA0B4B00AC97F6 /* CPUPriorbox.cpp */,
@ -1898,7 +1882,6 @@
children = (
489D7A162550FDC800AD896A /* MetalReLU6.metal */,
489D7A172550FDC800AD896A /* MetalReduction.hpp */,
489D7A182550FDC800AD896A /* MetalDequantize.hpp */,
489D7A192550FDC800AD896A /* MetalConvolutionGEMM.hpp */,
489D7A1A2550FDC800AD896A /* MetalReLU.metal */,
489D7A1D2550FDC800AD896A /* MetalROIPooling.hpp */,
@ -1936,9 +1919,7 @@
489D7A3E2550FDC800AD896A /* MetalScale.metal */,
489D7A3F2550FDC800AD896A /* MetalConvolution.hpp */,
489D7A402550FDC800AD896A /* MetalScale.mm */,
489D7A412550FDC800AD896A /* MetalDequantize.metal */,
489D7A422550FDC800AD896A /* MetalFixedPoint.metal */,
489D7A432550FDC800AD896A /* MetalDequantize.mm */,
489D7A442550FDC800AD896A /* MetalMatMul.hpp */,
489D7A452550FDC800AD896A /* MetalConvolution1x1.hpp */,
489D7A462550FDC800AD896A /* MetalConvolutionDepthwise.hpp */,
@ -2324,8 +2305,6 @@
92FF021B23AA0B5600AC97F6 /* compute */ = {
isa = PBXGroup;
children = (
C43C81DB2518944F00A0FF84 /* BlstmComputer.cpp */,
C43C81DC2518944F00A0FF84 /* BlstmComputer.hpp */,
C43C81DA2518944F00A0FF84 /* ConvInt8_1xN.cpp */,
C43C81D92518944F00A0FF84 /* ConvInt8_1xN.hpp */,
C43C81D62518944F00A0FF84 /* ConvInt83x3.cpp */,
@ -2567,12 +2546,10 @@
92FF035823AA0B5A00AC97F6 /* CPUTFQuantizedConv2D.hpp in Headers */,
92FF027223AA0B5A00AC97F6 /* CPUScatterNd.hpp in Headers */,
489D7A902550FDC900AD896A /* MetalConvolution.hpp in Headers */,
92FF025B23AA0B5A00AC97F6 /* CPUPoolGrad.hpp in Headers */,
92FF03A923AA0B5A00AC97F6 /* ConvolutionGroup.hpp in Headers */,
92FF03BD23AA0B5A00AC97F6 /* Int8FunctionsOpt.h in Headers */,
488F1159247BB2A0008E85C6 /* Arm82Raster.hpp in Headers */,
92FF036623AA0B5A00AC97F6 /* CPUDetectionOutput.hpp in Headers */,
489D7A692550FDC800AD896A /* MetalDequantize.hpp in Headers */,
92FF04BC23AA0BFB00AC97F6 /* NonCopyable.hpp in Headers */,
48FA474B23AA127B00172C3B /* Utils.hpp in Headers */,
C43C82302518951800A0FF84 /* ImageFloatBlitter.hpp in Headers */,
@ -2584,7 +2561,6 @@
92FF03AF23AA0B5A00AC97F6 /* WinogradOptFunction.hpp in Headers */,
92FF03C923AA0B5A00AC97F6 /* CPUMatMul.hpp in Headers */,
EBECA39924643D320062C7A3 /* Arm82Relu.hpp in Headers */,
C43C81E42518944F00A0FF84 /* BlstmComputer.hpp in Headers */,
92FF03B223AA0B5A00AC97F6 /* ConvolutionInt8Executor.hpp in Headers */,
92FF03A523AA0B5A00AC97F6 /* DeconvolutionWithStride.hpp in Headers */,
489D7A7F2550FDC900AD896A /* MetalReLU.hpp in Headers */,
@ -2839,7 +2815,6 @@
92FF034023AA0B5A00AC97F6 /* CPUShape.cpp in Sources */,
92FF02B023AA0B5A00AC97F6 /* CPUDequantize.cpp in Sources */,
92FF04C223AA0BFB00AC97F6 /* Pipeline.cpp in Sources */,
C43C81E32518944F00A0FF84 /* BlstmComputer.cpp in Sources */,
92FF04C423AA0BFB00AC97F6 /* Session.cpp in Sources */,
48A8A61321D101A700C2B9A7 /* ImageSampler.cpp in Sources */,
92FF02D123AA0B5A00AC97F6 /* MNNMaxFloat.S in Sources */,
@ -2874,7 +2849,6 @@
489D7AA42550FDC900AD896A /* MetalROIPooling.mm in Sources */,
92FF03B423AA0B5A00AC97F6 /* Convolution1x1Strassen.cpp in Sources */,
489D7A772550FDC800AD896A /* MetalConvolutionGEMM.mm in Sources */,
489D7A942550FDC900AD896A /* MetalDequantize.mm in Sources */,
92FF031623AA0B5A00AC97F6 /* MNNMatrixMax.S in Sources */,
92FF043A23AA0B7100AC97F6 /* ShapePermute.cpp in Sources */,
489D7A8E2550FDC900AD896A /* MetalPooling.mm in Sources */,
@ -2983,7 +2957,6 @@
48747D64245D9E33000B9709 /* GeometryTile.cpp in Sources */,
92FF043723AA0B7100AC97F6 /* ShapeDetectionOutput.cpp in Sources */,
92FF042623AA0B7100AC97F6 /* ShapeCosineSimilarity.cpp in Sources */,
489D7A922550FDC900AD896A /* MetalDequantize.metal in Sources */,
92FF02DC23AA0B5A00AC97F6 /* MNNReluInt8.S in Sources */,
92FF041A23AA0B7100AC97F6 /* ShapeFill.cpp in Sources */,
EB45C776244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */,
@ -2994,7 +2967,6 @@
92FF03A123AA0B5A00AC97F6 /* Int8FunctionsOpt.cpp in Sources */,
92FF026523AA0B5A00AC97F6 /* CPUQuantizedAvgPool.cpp in Sources */,
92FF029423AA0B5A00AC97F6 /* CPUMatMul.cpp in Sources */,
92FF038023AA0B5A00AC97F6 /* CPUPoolGrad.cpp in Sources */,
48747D62245D9E33000B9709 /* GeometryOPRegister.cpp in Sources */,
92FF03A323AA0B5A00AC97F6 /* ConvOpt.cpp in Sources */,
92FF02CD23AA0B5A00AC97F6 /* MNNNV21ToRGBUnit.S in Sources */,

View File

@ -15,6 +15,11 @@ def usage():
print(" [--MNNModel MNNMODEL]")
print(" [--fp16 {True,False}]")
print(" [--weightQuantBits {num of bits for weight-only-quant, default:0, which means no quant}]")
print(" [--weightQuantAsymmetric {True,False use asymmetric quant method for weight-only-quant, \
the default method is symmetric quant, which is compatible with old MNN versions. \
you can set this flag to True use asymmetric quant method to improve accuracy of the weight-quant model in some cases, \
but asymmetric quant model cannot run on old MNN versions. You will need to upgrade MNN to new version to solve this problem. \
default: False, which means using SYMMETRIC quant method}]")
print(" [--compressionParamsFile COMPRESSION_PARAMS_PATH]")
def main():
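
For context on the new flag: symmetric weight quantization fixes the zero point at 0, while asymmetric quantization picks a scale and zero point from the weights' [min, max] range, which can cover skewed weight distributions more tightly. A generic sketch of the two mappings (illustrative only, not necessarily MNN's exact scheme; assumes maxW > minW):

#include <cmath>
// Illustrative sketch, not MNN's converter code: symmetric vs. asymmetric int8 weight quant.
inline int quantSymmetric(float w, float maxAbsW) {
    float scale = maxAbsW / 127.0f;                  // zero point fixed at 0
    return (int)std::lround(w / scale);              // result in [-127, 127]
}
inline int quantAsymmetric(float w, float minW, float maxW) {
    float scale = (maxW - minW) / 255.0f;            // assumes maxW > minW
    int zeroPoint = (int)std::lround(-minW / scale) - 128;
    return (int)std::lround(w / scale) + zeroPoint;  // result in [-128, 127]
}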
@ -35,6 +40,7 @@ def main():
Boolean to change the mnn usage. If True, the output\
model save data in half_float type")
parser.add_argument("--weightQuantBits", type=int, default=0)
parser.add_argument("--weightQuantAsymmetric", type=bool, default=False)
parser.add_argument("--compressionParamsFile", type=str, default=None,
help="The path of model compression file that stores the int8 calibration \
table for quantization or auxiliary parameters for sparsity.")
@ -80,7 +86,7 @@ def main():
args.compressionParamsFile = ""
Tools.mnnconvert(args.MNNModel, args. modelFile, framework_type,\
args.fp16, args.prototxt, args.weightQuantBits, args.compressionParamsFile)
args.fp16, args.prototxt, args.weightQuantBits, args.weightQuantAsymmetric, args.compressionParamsFile)
return 0
if __name__ == "__main__":
main()

View File

@ -2,6 +2,12 @@
# Created by ruhuan on 2019.08.31
""" build wheel tool """
from __future__ import print_function
import argparse
parser = argparse.ArgumentParser(description='build pymnn wheel')
parser.add_argument('--x86', dest='x86', action='store_true', default=False,
help='build wheel for 32bit arch, only usable on windows')
args = parser.parse_args()
import os
import shutil
import platform
@ -19,4 +25,4 @@ if __name__ == '__main__':
if IS_WINDOWS:
os.putenv('DISTUTILS_USE_SDK', '1')
os.putenv('MSSdk', '1')
os.system('python setup.py bdist_wheel')
os.system('python setup.py bdist_wheel %s' % ('--x86' if args.x86 else ''))

View File

@ -18,7 +18,10 @@ IS_DARWIN = (platform.system() == 'Darwin')
IS_LINUX = (platform.system() == 'Linux')
BUILD_DIR = 'pymnn_build'
BUILD_TYPE = 'RELEASE'
BUILD_ARCH = 'x64' # x64 or x86
BUILD_ARCH = 'x64'
if '--x86' in sys.argv:
BUILD_ARCH = ''
sys.argv.remove('--x86')
def check_env_flag(name, default=''):
""" check whether a env is set to Yes """
@ -43,7 +46,7 @@ if os.path.isdir('../../schema/private'):
print ('Building with python wheel with package name ', package_name)
version = '1.0.11'
version = '1.1.0'
depend_pip_packages = ['flatbuffers', 'numpy']
if package_name == 'MNN':
README = os.path.join(os.getcwd(), "README.md")
@ -172,6 +175,8 @@ def configure_extension_build():
tools_include_dirs += [os.path.join(root_dir, "source", "core")]
tools_include_dirs += [os.path.join(root_dir, "schema", "current")]
tools_include_dirs += [os.path.join(root_dir, "source")]
if IS_WINDOWS:
tools_include_dirs += [os.path.join(os.environ['Protobuf_SRC_ROOT_FOLDER'], 'src')]
tools_depend = ['-lMNN', '-lMNNConvertDeps', '-lz']

View File

@ -30,9 +30,10 @@ static PyObject* PyTool_Converter(PyObject *self, PyObject *args) {
PyObject* frameworkType = NULL;
PyObject* fp16 = NULL;
PyObject* weightQuantBits = NULL;
if (!PyArg_ParseTuple(args, "ssOO|sOs", &mnnModel, &modelFile,
PyObject* weightQuantAsymmetric = NULL;
if (!PyArg_ParseTuple(args, "ssOO|sOOs", &mnnModel, &modelFile,
&frameworkType, &fp16, &prototxtFile,
&weightQuantBits, &compressionParamsFile)) {
&weightQuantBits, &weightQuantAsymmetric, &compressionParamsFile)) {
return NULL;
}
struct modelConfig modelPath;
@ -44,6 +45,7 @@ static PyObject* PyTool_Converter(PyObject *self, PyObject *args) {
modelPath.saveHalfFloat = static_cast<bool>(PyLong_AsLong(fp16));
modelPath.forTraining = false;
modelPath.weightQuantBits = static_cast<int>(PyLong_AsLong(weightQuantBits));
modelPath.weightQuantAsymmetric = static_cast<bool>(PyLong_AsLong(weightQuantAsymmetric));
if(prototxtFile){
modelPath.prototxtFile = std::string(prototxtFile);
}
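
Reading the updated format string: in PyArg_ParseTuple, "s" takes a C string, "O" a raw PyObject*, and "|" marks where optional arguments begin, so the extra "O" slots the new weightQuantAsymmetric object between weightQuantBits and compressionParamsFile:

// "ssOO|sOOs" maps, in order, to the addresses passed above:
//   s  mnnModel               (required)
//   s  modelFile              (required)
//   O  frameworkType          (required)
//   O  fp16                   (required)
//   |  optional from here on
//   s  prototxtFile
//   O  weightQuantBits
//   O  weightQuantAsymmetric  // the newly added slot
//   s  compressionParamsFile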

View File

@ -214,9 +214,10 @@ ErrorCode CPUArgMax::onExecute(const std::vector<Tensor *> &inputs, const std::v
backend()->onCopyBuffer(&mOutputBuffer, output);
} else {
float *dstOrigin = output->host<float>();
int outMaxValNum = mOutMaxVal + 1;
for (int i = 0; i < mNum; ++i) {
float *iptr = srcOrigin + i * mDim * mKeyExtent;
float *optr = dstOrigin + i * mKeyExtent;
float *optr = dstOrigin + i * mKeyExtent * mTopk * outMaxValNum;
for (int k = 0; k < mKeyExtent; ++k) {
// apply threshold
@ -239,9 +240,9 @@ ErrorCode CPUArgMax::onExecute(const std::vector<Tensor *> &inputs, const std::v
// copy index
for (int j = 0; j < mTopk; ++j) {
if (j < sortDim) {
optr[k + j*mKeyExtent] = element_index(vec[j]);
optr[k * outMaxValNum * mTopk + j] = element_index(vec[j]);
} else {
optr[k + j*mKeyExtent] = 0.f;
optr[k * outMaxValNum * mTopk + j] = 0.f;
}
}
@ -249,9 +250,9 @@ ErrorCode CPUArgMax::onExecute(const std::vector<Tensor *> &inputs, const std::v
if (mOutMaxVal) {
for (int j = 0; j < mTopk; ++j) {
if (j < sortDim) {
optr[k + j*mKeyExtent] = element_value(vec[j]);
optr[k * outMaxValNum * mTopk + mTopk + j] = element_value(vec[j]);
} else {
optr[k + j*mKeyExtent] = 0.f;
optr[k * outMaxValNum * mTopk + mTopk + j] = 0.f;
}
}
}
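
Consistent with the indexing above, the fix makes each (batch, key) slot contiguous: mTopk indices first, then mTopk values when mOutMaxVal is set. A standalone sketch of the corrected addressing (illustrative helper, not part of the diff):

#include <cstddef>
// Sketch: address of the j-th ArgMax result for (batch i, key k) under the new layout.
// outMaxVal selects whether a value plane follows the index plane.
static float* argmaxSlot(float* dstOrigin, int i, int k, int j,
                         int keyExtent, int topk, bool outMaxVal, bool wantValue) {
    const int outMaxValNum = outMaxVal ? 2 : 1;                 // matches mOutMaxVal + 1
    float* slot = dstOrigin + (size_t)(i * keyExtent + k) * topk * outMaxValNum;
    return wantValue ? slot + topk + j   // j-th top value (only meaningful if outMaxVal)
                     : slot + j;         // j-th top index
}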

View File

@ -264,7 +264,7 @@ ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vec
if (biasLength % 4 != 0) {
// Padding to align of 4
biasWrap.reset(Tensor::createDevice<float>({UP_DIV(biasLength, 4) * 4}));
bool res = backend()->onAcquireBuffer(biasWrap.get(), Backend::DYNAMIC);
res = backend()->onAcquireBuffer(biasWrap.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}

View File

@ -19,7 +19,6 @@ extern void ___CPUSizeCreator__OpType_Size__();
extern void ___CPUUnravelIndexCreator__OpType_UnravelIndex__();
extern void ___CPUMatMulCreator__OpType_MatMul__();
extern void ___CPUMomentsCreator__OpType_Moments__();
extern void ___CPUPoolGradCreator__OpType_PoolGrad__();
extern void ___CPUInstanceNormCreator__OpType_InstanceNorm__();
extern void ___CPUQuantizedLogisticCreator__OpType_QuantizedLogistic__();
extern void ___CPUWhereCreator__OpType_Where__();
@ -93,7 +92,6 @@ ___CPUSizeCreator__OpType_Size__();
___CPUUnravelIndexCreator__OpType_UnravelIndex__();
___CPUMatMulCreator__OpType_MatMul__();
___CPUMomentsCreator__OpType_Moments__();
___CPUPoolGradCreator__OpType_PoolGrad__();
___CPUInstanceNormCreator__OpType_InstanceNorm__();
___CPUQuantizedLogisticCreator__OpType_QuantizedLogistic__();
___CPUWhereCreator__OpType_Where__();

View File

@ -63,6 +63,8 @@ struct QuanPostTreatParameters {
const int32_t* bias;
int32_t maxValue;
int32_t minValue;
float roundValuePos = 0.5f;
float roundValueNeg = -0.5f;
};
void MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post);
void MNNGemmInt8AddBiasScale_16x4_Unit_FAST(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post);
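
The two new fields drive round-to-nearest with ties away from zero: the kernels add roundValuePos before truncation when the result is non-negative and roundValueNeg when it is negative (the compare/blend/add/round-toward-zero sequence in the AVX kernels further down). A scalar sketch of the same step (illustrative only):

#include <cmath>
// Sketch: the rounding step parameterized by QuanPostTreatParameters'
// roundValuePos (0.5f) and roundValueNeg (-0.5f); truncation is toward zero.
inline int roundQuant(float v, float roundValuePos, float roundValueNeg) {
    float offset = (v < 0.0f) ? roundValueNeg : roundValuePos;
    return (int)std::trunc(v + offset);
}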

View File

@ -90,6 +90,7 @@ void MNNFunctionInit() {
gFunc.MNNPackC4ForMatMul_A = _AVX_MNNPackC4ForMatMul_A;
gFunc.MNNConvRunForLineDepthwise = _AVX_MNNConvRunForLineDepthwise;
gFunc.MNNGemmInt8AddBiasScale_16x4_Unit = _AVX_MNNGemmInt8AddBiasScale_16x4_Unit;
gFunc.MNNExpC8 = _AVX_MNNExpC8;
if (cpuFlags & libyuv::kCpuHasFMA3) {
gFunc.MNNGemmFloatUnit_4 = _AVX_MNNGemmFloatUnitFMA_4;
gFunc.MNNGemmFloatCommon_4 = _AVX_MNNGemmFloatCommonFMA_4;
@ -323,28 +324,6 @@ void MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSiz
float* cache, const float* postParameters, const float* bias) {
return gFunc.MNNPackedMatMulRemain(C, A, B, eSize, parameter, cache, postParameters, bias);
}
/**
void MNNExpC8(float* dest, const float* source, const float* parameters, size_t countC8) {
auto count = countC8 * 8;
auto param = parameters[0];
float xLimit = 87;
for (int i = 0; i < count; ++i) {
auto x = -source[i];
x = ALIMAX(x, -xLimit);
x = ALIMIN(x, xLimit);
int div = (x * parameters[1]);
int div2 = (div + 127) << 23;
auto xReamin = x - div * param;
float expBasic = *(float*)(&div2);
auto t = xReamin;
auto expRemain =
((((parameters[7] * t + parameters[6]) * t + parameters[5]) * t + parameters[4]) * t + parameters[3]) * t +
parameters[2];
dest[i] = expBasic * expRemain;
}
}
*/
void MNNExpC8(float* dest, const float* source, const float* parameters, size_t countC8) {
gFunc.MNNExpC8(dest, source, parameters, countC8);
}

View File

@ -124,6 +124,48 @@ static void _postTreat(float* C, size_t eSize, const size_t* parameter, const fl
}
}
void _AVX_MNNExpC8(float* dest, const float* source, const float* parameters, size_t countC8) {
auto count = countC8;
auto p0 = _mm256_set1_ps(parameters[0]);
auto p1 = _mm256_set1_ps(parameters[1]);
auto p2 = _mm256_set1_ps(parameters[2]);
auto p3 = _mm256_set1_ps(parameters[3]);
auto p4 = _mm256_set1_ps(parameters[4]);
auto p5 = _mm256_set1_ps(parameters[5]);
auto p6 = _mm256_set1_ps(parameters[6]);
auto p7 = _mm256_set1_ps(parameters[7]);
auto xMax = _mm256_set1_ps(87);
auto xMin = _mm256_set1_ps(-87);
auto basic = _mm256_set1_epi32(1 << 23);
auto temp127 = _mm256_set1_epi32(127);
auto negZero = _mm256_set1_ps(-0.f);
for (int i = 0; i < count; ++i) {
auto x = _mm256_xor_ps(_mm256_loadu_ps(source + i * 8), negZero);
x = _mm256_max_ps(x, xMin);
x = _mm256_min_ps(x, xMax);
auto div = _mm256_mul_ps(x, p1);
auto divInt = _mm256_cvtps_epi32(div);
div = _mm256_cvtepi32_ps(divInt);
auto div2 = _mm256_add_epi32(divInt, temp127);
div2 = _mm256_mullo_epi32(div2, basic);
auto expBasic = _mm256_castsi256_ps(div2);
auto xReamin = _mm256_sub_ps(x, _mm256_mul_ps(div, p0));
auto t = xReamin;
auto c0 = _mm256_mul_ps(p7, t);
auto c1 = _mm256_add_ps(c0, p6);
auto c2 = _mm256_mul_ps(c1, t);
auto c3 = _mm256_add_ps(c2, p5);
auto c4 = _mm256_mul_ps(c3, t);
auto c5 = _mm256_add_ps(c4, p4);
auto c6 = _mm256_mul_ps(c5, t);
auto c7 = _mm256_add_ps(c6, p3);
auto c8 = _mm256_mul_ps(c7, t);
auto c9 = _mm256_add_ps(c8, p2);
auto expRemain = c9;
_mm256_storeu_ps(dest + 8 * i, _mm256_mul_ps(expBasic, expRemain));
}
}
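
The kernel above uses the usual range reduction exp(x) ≈ 2^div * poly(r): adding 127 to divInt and multiplying by 1 << 23 places the biased exponent directly in the float's exponent field, so expBasic is exactly 2^div. A scalar sketch of that bit trick (assumes IEEE-754 binary32 and -126 <= n <= 127):

#include <cstdint>
#include <cstring>
// Sketch: build 2^n as a float by writing (n + 127) into the exponent bits.
inline float pow2i(int n) {
    uint32_t bits = (uint32_t)(n + 127) << 23;   // sign 0, biased exponent n + 127, mantissa 0
    float out;
    std::memcpy(&out, &bits, sizeof(out));       // reinterpret the bit pattern as float
    return out;                                  // equals 2^n
}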
void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep) {

View File

@ -90,4 +90,6 @@ void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
size_t srcHStep, size_t dstHStep);
void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post);
void _AVX_MNNExpC8(float* dest, const float* source, const float* parameters, size_t countC8);
}

View File

@ -203,8 +203,19 @@ void AVX2GemmPostTreat(float* C, size_t eSize, const size_t* parameter, const fl
}
}
}
#ifdef MNN_X86_USE_ASM
extern "C" {
void _AVX_MNNGemmInt8AddBiasScale_16x4_UnitMain(int8_t* dst, const int8_t* src, const int8_t* weight, const size_t* strides, const QuanPostTreatParameters* post);
}
#endif
void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post) {
#ifdef MNN_X86_USE_ASM
size_t strides[3];
strides[0] = src_depth_quad;
strides[1] = dst_step;
strides[2] = dst_depth_quad;
_AVX_MNNGemmInt8AddBiasScale_16x4_UnitMain(dst, src, weight, strides, post);
#else
const auto dst_step_tmp = dst_step / sizeof(int8_t);
__m128 zero128 = _mm_set1_ps(0.0f);
__m128 minValue = _mm_set1_ps(post->minValue);
@ -356,4 +367,5 @@ auto d##i = _mm_add_epi32(d##i##0, d##i##1);
d0 = _mm_packs_epi16(d0, d2);
_mm_storeu_ps((float*)dst_x, _mm_castsi128_ps(d0));
}
#endif
}

View File

@ -0,0 +1,309 @@
//
// _AVX_MNNGemmInt8AddBiasScale_16x4_Unit.S
// MNN
//
// Created by MNN on 2020/11/04.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "../MNNAsmGlobal.h"
.text
.align 4
//struct QuanPostTreatParameters {
// const float* scale;
// const int32_t* bias;
// int32_t maxValue;
// int32_t minValue;
// float roundValuePos = 0.5f;
// float roundValueNeg = -0.5f;
//};
asm_function _AVX_MNNGemmInt8AddBiasScale_16x4_UnitMain
//void _AVX_MNNGemmInt8AddBiasScale_16x4_UnitMain(int8_t* dst, const int8_t* src, const int8_t* weight, const size_t* strides, const QuanPostTreatParameters* post);
// SystemV Auto: rdi: dst, rsi:src, rdx:weight, rcx:strides, r8: post
// Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter
pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
movq 48(%rsp), %r10
pushq %rdi
pushq %rsi
pushq %r12
pushq %r13
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
movq %r10, %r9
pushq %r14
pushq %r15
#else
pushq %r12
pushq %r13
pushq %r14
pushq %r15
movq %r8, %r9
#endif
movq 8(%rcx), %r10 // dst_step
movq 16(%rcx), %r8 // dst_depth_quad
movq (%rcx), %rcx // src_depth_quad
movq (%r9), %r12 // scale
movq 8(%r9), %r15 // bias
// ymm0-ymm1: Src
// ymm2-ymm3: Weight
// ymm4-ymm7: TmpDst
// ymm8-ymm15: Dst Sum
// Last dst save to ymm8-ymm11
cmpq $0, %r8
je End
movq %rsi, %r13
subq $64, %rsp
LoopDz:
movq %rcx, %r11
movq %r13, %rsi
movq %rdx, %r14
subq $1, %r11
vpmovsxbw (%rsi), %ymm0
vpmovsxbw 16(%rsi), %ymm1
vpmovsxbw (%rdx), %ymm2
vpmovsxbw 16(%rdx), %ymm3
vpmaddwd %ymm0, %ymm2, %ymm8
vpmaddwd %ymm0, %ymm3, %ymm9
vpmaddwd %ymm1, %ymm2, %ymm12
vpmaddwd %ymm1, %ymm3, %ymm13
vpmovsxbw 32(%rdx), %ymm2
vpmovsxbw 48(%rdx), %ymm3
vpmaddwd %ymm0, %ymm2, %ymm10
vpmaddwd %ymm0, %ymm3, %ymm11
vpmaddwd %ymm1, %ymm2, %ymm14
vpmaddwd %ymm1, %ymm3, %ymm15
addq $64, %rdx
addq $64, %rsi
testq %r11, %r11
je FirstLoopSzEnd
FirstLoopSz:
vpmovsxbw (%rsi), %ymm0
vpmovsxbw 16(%rsi), %ymm1
vpmovsxbw (%rdx), %ymm2
vpmovsxbw 16(%rdx), %ymm3
vpmaddwd %ymm0, %ymm2, %ymm4
vpmaddwd %ymm0, %ymm3, %ymm5
vpmaddwd %ymm1, %ymm2, %ymm6
vpmaddwd %ymm1, %ymm3, %ymm7
vpaddd %ymm4, %ymm8, %ymm8
vpaddd %ymm5, %ymm9, %ymm9
vpmovsxbw 32(%rdx), %ymm2
vpmovsxbw 48(%rdx), %ymm3
vpaddd %ymm6, %ymm12, %ymm12
vpaddd %ymm7, %ymm13, %ymm13
vpmaddwd %ymm0, %ymm2, %ymm4
vpmaddwd %ymm0, %ymm3, %ymm5
vpmaddwd %ymm1, %ymm2, %ymm6
vpmaddwd %ymm1, %ymm3, %ymm7
vpaddd %ymm4, %ymm10, %ymm10
vpaddd %ymm5, %ymm11, %ymm11
vpaddd %ymm6, %ymm14, %ymm14
vpaddd %ymm7, %ymm15, %ymm15
addq $64, %rdx
addq $64, %rsi
subq $1, %r11
testq %r11, %r11
jne FirstLoopSz
FirstLoopSzEnd:
vphaddd %ymm9, %ymm8, %ymm8
vphaddd %ymm11, %ymm10, %ymm10
vphaddd %ymm13, %ymm12, %ymm12
vphaddd %ymm15, %ymm14, %ymm14
vphaddd %ymm10, %ymm8, %ymm8
vphaddd %ymm14, %ymm12, %ymm9
vmovups %ymm8, (%rsp)
vmovups %ymm9, 32(%rsp)
movq %rcx, %r11
movq %r13, %rsi
movq %r14, %rdx
vpmovsxbw 32(%rsi), %ymm0
vpmovsxbw 48(%rsi), %ymm1
vpmovsxbw (%rdx), %ymm2
vpmovsxbw 16(%rdx), %ymm3
vpmaddwd %ymm0, %ymm2, %ymm8
vpmaddwd %ymm0, %ymm3, %ymm9
vpmaddwd %ymm1, %ymm2, %ymm12
vpmaddwd %ymm1, %ymm3, %ymm13
vpmovsxbw 32(%rdx), %ymm2
vpmovsxbw 48(%rdx), %ymm3
vpmaddwd %ymm0, %ymm2, %ymm10
vpmaddwd %ymm0, %ymm3, %ymm11
vpmaddwd %ymm1, %ymm2, %ymm14
vpmaddwd %ymm1, %ymm3, %ymm15
addq $64, %rdx
addq $64, %rsi
subq $1, %r11
testq %r11, %r11
je SecondLoopSzEnd
SecondLoopSz:
vpmovsxbw 32(%rsi), %ymm0
vpmovsxbw 48(%rsi), %ymm1
vpmovsxbw (%rdx), %ymm2
vpmovsxbw 16(%rdx), %ymm3
vpmaddwd %ymm0, %ymm2, %ymm4
vpmaddwd %ymm0, %ymm3, %ymm5
vpmaddwd %ymm1, %ymm2, %ymm6
vpmaddwd %ymm1, %ymm3, %ymm7
vpaddd %ymm4, %ymm8, %ymm8
vpaddd %ymm5, %ymm9, %ymm9
vpmovsxbw 32(%rdx), %ymm2
vpmovsxbw 48(%rdx), %ymm3
vpaddd %ymm6, %ymm12, %ymm12
vpaddd %ymm7, %ymm13, %ymm13
vpmaddwd %ymm0, %ymm2, %ymm4
vpmaddwd %ymm0, %ymm3, %ymm5
vpmaddwd %ymm1, %ymm2, %ymm6
vpmaddwd %ymm1, %ymm3, %ymm7
vpaddd %ymm4, %ymm10, %ymm10
vpaddd %ymm5, %ymm11, %ymm11
vpaddd %ymm6, %ymm14, %ymm14
vpaddd %ymm7, %ymm15, %ymm15
addq $64, %rdx
addq $64, %rsi
subq $1, %r11
testq %r11, %r11
jne SecondLoopSz
SecondLoopSzEnd:
vphaddd %ymm9, %ymm8, %ymm8
vphaddd %ymm11, %ymm10, %ymm10
vphaddd %ymm13, %ymm12, %ymm12
vphaddd %ymm15, %ymm14, %ymm14
vphaddd %ymm10, %ymm8, %ymm10
vphaddd %ymm14, %ymm12, %ymm11
vmovups (%rsp), %ymm8
vmovups 32(%rsp), %ymm9
Last:
.macro TRANSPOSE x0, x1, x2, x3
// 32 = 0 + 16 * 2: first 128 x0_lo, second 128 x1_lo
// 49 = 1 + 16 * 3: first 128 x0_hi, second 128 x1_hi
vperm2f128 $32, \x1, \x0, \x2
vperm2f128 $49, \x1, \x0, \x3
.endm
TRANSPOSE %ymm8, %ymm10, %ymm0, %ymm1
TRANSPOSE %ymm9, %ymm11, %ymm2, %ymm3
vpaddd %ymm0, %ymm1, %ymm0
vpaddd %ymm2, %ymm3, %ymm2
vbroadcastf128 (%r12), %ymm8
vbroadcastf128 (%r15), %ymm9
vpaddd %ymm9, %ymm0, %ymm0
vpaddd %ymm9, %ymm2, %ymm2
vcvtdq2ps %ymm0, %ymm0
vcvtdq2ps %ymm2, %ymm2
vmulps %ymm8, %ymm0, %ymm0
vmulps %ymm8, %ymm2, %ymm2
// zero
vxorps %ymm13, %ymm13, %ymm13
vbroadcastss 24(%r9), %ymm14
vbroadcastss 28(%r9), %ymm15
vbroadcastss 16(%r9), %ymm10
vbroadcastss 20(%r9), %ymm11
// Round
vcmpltps %ymm13, %ymm0, %ymm4
vcmpltps %ymm13, %ymm2, %ymm5
vblendvps %ymm4, %ymm15, %ymm14, %ymm4
vblendvps %ymm5, %ymm15, %ymm14, %ymm5
vaddps %ymm0, %ymm4, %ymm0
vaddps %ymm2, %ymm5, %ymm2
// 3: ROUND to Zero
vroundps $3, %ymm0, %ymm0
vroundps $3, %ymm2, %ymm2
vcvtps2dq %ymm0, %ymm0
vcvtps2dq %ymm2, %ymm2
vpminsd %ymm10, %ymm0, %ymm0
vpminsd %ymm10, %ymm2, %ymm2
vpmaxsd %ymm11, %ymm0, %ymm0
vpmaxsd %ymm11, %ymm2, %ymm2
vpackssdw %ymm2, %ymm0, %ymm0
vperm2f128 $1, %ymm0, %ymm0, %ymm1
vpacksswb %ymm1, %ymm0, %ymm0
addq $16, %r12
addq $16, %r15
vmovups %xmm0, (%rdi)
addq %r10, %rdi
subq $1, %r8
testq %r8, %r8
jne LoopDz
addq $64, %rsp
End:
#ifdef WIN32
popq %r15
popq %r14
popq %r13
popq %r12
popq %rsi
popq %rdi
popq %rbp
#else
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbp
#endif
// FIXME: if vzeroall is skipped here, subsequent ops become slow
vzeroall
retq
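For reference, the post-treatment encoded in the Last: block above (add bias, multiply by the per-channel scale, round half away from zero with roundValuePos/roundValueNeg, truncate, clamp) can be written as a scalar C++ sketch. The struct mirrors the comment at the top of this file; the helper name and per-channel indexing are illustrative only.
#include <algorithm>
#include <cmath>
#include <cstdint>

struct QuanPostTreatParameters {
    const float* scale;
    const int32_t* bias;
    int32_t maxValue;
    int32_t minValue;
    float roundValuePos = 0.5f;
    float roundValueNeg = -0.5f;
};

// acc: raw int32 accumulator for one output channel, before bias/scale.
inline int8_t quanPostTreatScalar(int32_t acc, int channel, const QuanPostTreatParameters* post) {
    float value = static_cast<float>(acc + post->bias[channel]) * post->scale[channel];
    // add +0.5 / -0.5 depending on sign, then truncate toward zero (vroundps $3)
    value += (value < 0.0f) ? post->roundValueNeg : post->roundValuePos;
    int32_t rounded = static_cast<int32_t>(std::trunc(value));
    rounded = std::max(post->minValue, std::min(post->maxValue, rounded));
    return static_cast<int8_t>(rounded);
}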

View File

@ -14,18 +14,7 @@
#if MNN_METAL_ENABLED
#if MNN_METAL_DEBUG && MNN_METAL_BENCHMARK
#define MNN_PRINT_ENCODER(context, encoder) \
{ \
[context printEncoder:encoder]; \
[context commit]; \
[context wait]; \
}
#elif MNN_METAL_DEBUG
#define MNN_PRINT_ENCODER(context, encoder) [context printEncoder:encoder];
#else
#define MNN_PRINT_ENCODER(context, encoder) ((void)0)
#endif
namespace MNN {
typedef enum {
@ -121,6 +110,10 @@ typedef struct {
threads:(MTLSize)threads
threadsPerGroup:(MTLSize)threadsPerGroup
bandwidth:(MNN::MetalBandwidth)bandwidth;
- (id<MTLComputePipelineState>)pipelineWithName:(NSString *)name;
- (MTLSize)computeBestGroup:(id<MTLComputePipelineState>) pipeline threads:(MTLSize)threads;
- (std::pair<MTLSize, MTLSize>)computeBestGroupAndLocal:(id<MTLComputePipelineState>) bw threads:(MTLSize)t;
#if MNN_METAL_DEBUG
/**
@ -147,6 +140,8 @@ typedef struct {
* @brief print encoder
*/
- (void)printEncoder:(id<MTLCommandEncoder>)encoder;
#endif
@end

View File

@ -176,10 +176,7 @@ using namespace MNN;
}
- (void)wait {
NSArray *buffers = _waitings.copy;
[_waitings removeAllObjects];
for (id<MTLCommandBuffer> buffer in buffers) {
for (id<MTLCommandBuffer> buffer in _waitings) {
if (buffer.status >= MTLCommandBufferStatusCompleted)
continue;
@ -204,6 +201,7 @@ using namespace MNN;
}
#endif
}
[_waitings removeAllObjects];
}
static NSUInteger smallest_log2(NSUInteger integer) {
@ -217,6 +215,83 @@ static NSUInteger smallest_log2(NSUInteger integer) {
return power;
}
- (std::pair<MTLSize, MTLSize>)computeBestGroupAndLocal:(id<MTLComputePipelineState>) bw threads:(MTLSize)t {
auto local = [self computeBestGroup:bw threads:t];
auto globalSize = MTLSizeMake(UP_DIV(t.width, local.width), UP_DIV(t.height, local.height), UP_DIV(t.depth, local.depth));
return std::make_pair(globalSize, local);
}
- (MTLSize)computeBestGroup:(id<MTLComputePipelineState>) bw threads:(MTLSize)t {
if (bw.maxTotalThreadsPerThreadgroup > 64) {
auto res = MTLSizeMake(8, 8, 8);
int reduceNumber = 0;
if (t.depth < 4) {
res.depth = 1;
reduceNumber++;
}
if (t.width < 4) {
res.width = 1;
reduceNumber++;
}
if (t.height < 4) {
res.height = 1;
reduceNumber++;
}
if (reduceNumber == 0) {
return MTLSizeMake(4, 4, 4);
}
if (reduceNumber == 2) {
if (res.width > 1) {
res.width = 64;
}
if (res.height > 1) {
res.height = 64;
}
if (res.depth > 1) {
res.depth = 64;
}
}
return res;
}
auto pwarp = smallest_log2(bw.threadExecutionWidth);
auto px = smallest_log2(t.width), sx = (NSUInteger)ceil(log2(t.width));
auto py = smallest_log2(t.height), sy = (NSUInteger)ceil(log2(t.height));
// accurately match on x
if (px >= pwarp) {
return {bw.threadExecutionWidth, 1, 1};
}
// accurately match on xy
else if (px + py >= pwarp && sx < pwarp / 2) {
NSUInteger x = pow(2, px);
return {x, bw.threadExecutionWidth / x, 1};
}
// similarly match on x
else if (sx >= pwarp) {
return {bw.threadExecutionWidth, 1, 1};
}
// similarly match on xy
else if (sx + sy >= pwarp) {
NSUInteger x = pow(2, sx);
return {x, bw.threadExecutionWidth / x, 1};
}
// on xyz (most shaders do not guard gid.z, so the z axis must be matched exactly)
auto pz = smallest_log2(t.depth);
auto sz = pz;
if (px + py + pz >= pwarp) {
NSUInteger x = pow(2, px), y = pow(2, py);
return {x, y, bw.threadExecutionWidth / x / y};
} else if (sx + sy + sz >= pwarp) {
NSUInteger x = pow(2, sx), z = pow(2, MIN(sz, pwarp - sx));
return {x, bw.threadExecutionWidth / x / z, z};
} else {
NSUInteger z = pow(2, sz);
return {t.width, t.height, z};
}
}
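A minimal C++ sketch of the pairing done by computeBestGroupAndLocal above, assuming a plain struct in place of MTLSize and a hypothetical upDiv helper; the executions later hand the pair to dispatchThreadgroups:threadsPerThreadgroup:.
#include <cstddef>
#include <utility>

struct Size3 { std::size_t width, height, depth; };

static inline std::size_t upDiv(std::size_t a, std::size_t b) { return (a + b - 1) / b; }

// local comes from the heuristic above; the returned pair is
// (threadgroup grid, threads per threadgroup).
std::pair<Size3, Size3> bestGroupAndLocal(Size3 threads, Size3 local) {
    Size3 groups {
        upDiv(threads.width,  local.width),
        upDiv(threads.height, local.height),
        upDiv(threads.depth,  local.depth)
    };
    return { groups, local };
}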
- (MTLSize)threadsPerGroupWithThreads:(MTLSize)t bandwidth:(MetalBandwidth)bw {
auto pwarp = smallest_log2(bw.threadExecutionWidth);
auto px = smallest_log2(t.width), sx = (NSUInteger)ceil(log2(t.width));

View File

@ -27,11 +27,14 @@ public:
id<MTLBuffer> alloc(size_t size, bool seperate = false);
void release(id<MTLBuffer> buffer);
void clear();
float computeSizeInMB() const;
private:
std::map<id<MTLBuffer>, size_t> mAllocated;
std::multimap<size_t, id<MTLBuffer>> mReusableBuffers;
void* mContext = nullptr;
};
virtual float onGetMemoryInMB() override;
MetalRuntime();
virtual ~ MetalRuntime();
virtual Backend* onCreate() const override;
@ -115,18 +118,21 @@ public:
* @param dstTensor destined tensor
* @param encoder command encoder
*/
virtual void onCopyBuffer(const Tensor *srcTensor, const Tensor *dstTensor,
id<MTLComputeCommandEncoder> encoder) const;
void onCopyBuffer(const Tensor *srcTensor, const Tensor *dstTensor,
id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> shape) const;
void flushEncoder() const;
id<MTLComputeCommandEncoder> encoder() const;
private:
const MetalRuntime* mRuntime;
std::vector<id<MTLBuffer>> mHoldBuffers;
mutable id<MTLComputeCommandEncoder> mComputeEncoder = nil;
private:
id<MTLBuffer> getHostBuffer(size_t size) const;
void onCopyHostToDevice(const Tensor *src, const Tensor *dst) const;
void onCopyDeviceToHost(const Tensor *src, const Tensor *dst) const;
void onCopyDeviceToDevice(const Tensor *src, const Tensor *dst, id<MTLComputeCommandEncoder> encoder) const;
void onCopyDeviceToDevice(const Tensor *src, const Tensor *dst, id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> shape) const;
};

View File

@ -223,17 +223,17 @@ struct SamplerInfo {
uint4 extent;//dstStride[3]+dstOffset
uint4 imageSize;
};
kernel void blit_float(const device ftype *in [[buffer(0)]],
device ftype *out [[buffer(1)]],
kernel void blit_intx4(const device int4 *in [[buffer(0)]],
device int4 *out [[buffer(1)]],
constant SamplerInfo &info [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x < info.size.x && gid.y < info.size.y && gid.z < info.size.z) {
uint dstOffset = gid.x * info.extent.x + gid.y * info.extent.y + gid.z * info.extent.z + info.extent.w;
uint srcOffset = gid.x * info.stride.x + gid.y * info.stride.y + gid.z * info.stride.z + info.stride.w;
out[int(dstOffset)] = in[int(srcOffset)];
}
}
}
kernel void blit_int(const device int *in [[buffer(0)]],
device int *out [[buffer(1)]],
constant SamplerInfo &info [[buffer(2)]],
@ -265,3 +265,14 @@ kernel void blit_int16(const device short *in [[buffer(0)]],
out[int(dstOffset)] = in[int(srcOffset)];
}
}
kernel void blit_int64(const device short4 *in [[buffer(0)]],
device short4 *out [[buffer(1)]],
constant SamplerInfo &info [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x < info.size.x && gid.y < info.size.y && gid.z < info.size.z) {
uint dstOffset = gid.x * info.extent.x + gid.y * info.extent.y + gid.z * info.extent.z + info.extent.w;
uint srcOffset = gid.x * info.stride.x + gid.y * info.stride.y + gid.z * info.stride.z + info.stride.w;
out[int(dstOffset)] = in[int(srcOffset)];
}
}

View File

@ -23,6 +23,20 @@ MetalRuntime::BufferAllocator::BufferAllocator(void* context) {
MetalRuntime::BufferAllocator::~ BufferAllocator() {
// Do nothing
}
float MetalRuntime::BufferAllocator::computeSizeInMB() const {
float total = 0.0f;
for (auto& iter : mAllocated) {
total += iter.second / 1024.0f / 1024.0f;
}
for (auto& iter : mReusableBuffers) {
total += iter.first / 1024.0f / 1024.0f;
}
return total;
}
float MetalRuntime::onGetMemoryInMB() {
return mStatic->computeSizeInMB() + mDynamic->computeSizeInMB();
}
id<MTLBuffer> MetalRuntime::BufferAllocator::alloc(size_t size, bool seperate) {
if (!seperate) {
@ -164,10 +178,18 @@ Execution *MetalBackend::onCreate(const std::vector<Tensor *> &inputs, const std
}
return exe;
}
void MetalBackend::flushEncoder() const {
if (nil != mComputeEncoder) {
[mComputeEncoder endEncoding];
mComputeEncoder = nil;
}
}
void MetalBackend::onExecuteBegin() const {
// do nothing
flushEncoder();
}
void MetalBackend::onExecuteEnd() const {
flushEncoder();
auto ctx = (__bridge MNNMetalContext *)context();
[ctx commit];
}
@ -176,7 +198,7 @@ id<MTLBuffer> MetalBackend::getHostBuffer(size_t size) const {
return mRuntime->getHostBuffer(size);
}
std::tuple<id<MTLBuffer>, MTLSize> getTensorShape(MNNMetalContext *context, const Tensor *tensor) {
MTLSize getTensorShape(id<MTLBuffer> shape, const Tensor *tensor) {
int s = 1, c = 1, b = 1;
if (tensor->dimensions() == 4) {
s = tensor->width() * tensor->height();
@ -193,7 +215,6 @@ std::tuple<id<MTLBuffer>, MTLSize> getTensorShape(MNNMetalContext *context, cons
int z = UP_DIV(c, 4);
// shape
auto shape = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
((int *)shape.contents)[0] = s;
((int *)shape.contents)[1] = c;
((int *)shape.contents)[2] = z;
@ -201,7 +222,7 @@ std::tuple<id<MTLBuffer>, MTLSize> getTensorShape(MNNMetalContext *context, cons
// threads
MTLSize threads = {(NSUInteger)s, (NSUInteger)b * z, 1};
return std::make_tuple(shape, threads);
return threads;
}
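Roughly, the rewritten getTensorShape fills the caller-provided shape buffer and returns the dispatch grid; a small C++ sketch with illustrative names (the fourth int of the shape buffer is omitted here):
static inline int upDiv(int a, int b) { return (a + b - 1) / b; }

struct ConvertDispatch {
    int shape[4];                   // first three ints match the buffer filled above
    int threadsX, threadsY, threadsZ;
};

ConvertDispatch shapeForConvert(int width, int height, int channel, int batch) {
    ConvertDispatch d {};
    int s = width * height;         // collapsed spatial size
    int z = upDiv(channel, 4);      // channel slices of 4
    d.shape[0] = s;
    d.shape[1] = channel;
    d.shape[2] = z;
    d.threadsX = s;
    d.threadsY = batch * z;
    d.threadsZ = 1;
    return d;
}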
enum MetalCastType : int {
@ -294,7 +315,8 @@ void MetalBackend::onCopyHostToDevice(const Tensor *src, const Tensor *dst) cons
1,
1
};
auto limitBuffer = [ctx newDeviceBuffer:4 * sizeof(int) bytes:&limits access:CPUWriteOnly];
auto limitBuffer = mRuntime->mStatic->alloc(4 * sizeof(int));
::memcpy(limitBuffer.contents, limits, sizeof(limits));
auto encoder = [ctx encoder];
auto bandwidth = [ctx load: @"downcast_float4" encoder:encoder];
[encoder setBuffer:host offset:0 atIndex:0];
@ -304,6 +326,7 @@ void MetalBackend::onCopyHostToDevice(const Tensor *src, const Tensor *dst) cons
[encoder endEncoding];
[ctx commit];
[ctx wait];
mRuntime->mStatic->release(limitBuffer);
} else {
[ctx commit];
[ctx wait];
@ -312,7 +335,8 @@ void MetalBackend::onCopyHostToDevice(const Tensor *src, const Tensor *dst) cons
}
// convert
else {
auto shape = getTensorShape(ctx, src);
auto shape = mRuntime->mStatic->alloc(4 * sizeof(int));
auto size = getTensorShape(shape, src);
auto buffer = getHostBuffer(src->elementSize() * sizeof(float));
memcpy(buffer.contents, src->host<float>(), src->size());
auto encoder = [ctx encoder];
@ -322,11 +346,12 @@ void MetalBackend::onCopyHostToDevice(const Tensor *src, const Tensor *dst) cons
auto bandwidth = [ctx load:kernel encoder:encoder];
[encoder setBuffer:buffer offset:0 atIndex:0];
[encoder setBuffer:device offset:0 atIndex:1];
[encoder setBuffer:std::get<0>(shape) offset:0 atIndex:2];
[ctx dispatchEncoder:encoder threads:std::get<1>(shape) bandwidth:bandwidth];
[encoder setBuffer:shape offset:0 atIndex:2];
[ctx dispatchEncoder:encoder threads:size bandwidth:bandwidth];
[encoder endEncoding];
[ctx commit];
[ctx wait];
mRuntime->mStatic->release(shape);
}
}
@ -356,7 +381,8 @@ void MetalBackend::onCopyDeviceToHost(const Tensor *src, const Tensor *dst) cons
1,
1
};
auto limitBuffer = [ctx newDeviceBuffer:4 * sizeof(int) bytes:&limits access:CPUWriteOnly];
auto limitBuffer = mRuntime->mStatic->alloc(4 * sizeof(int));
::memcpy(limitBuffer.contents, limits, sizeof(limits));
[encoder setBuffer:limitBuffer offset:0 atIndex:2];
[ctx dispatchEncoder:encoder threads:{sizeC4, 1, 1} bandwidth:bandwidth];
[encoder endEncoding];
@ -364,6 +390,7 @@ void MetalBackend::onCopyDeviceToHost(const Tensor *src, const Tensor *dst) cons
[ctx wait];
memcpy(dst->host<float>(), buffer.contents, dst->size());
mRuntime->mStatic->release(limitBuffer);
} else {
[ctx commit];
[ctx wait];
@ -372,7 +399,8 @@ void MetalBackend::onCopyDeviceToHost(const Tensor *src, const Tensor *dst) cons
}
// convert
else {
auto shape = getTensorShape(ctx, src);
auto shape = mRuntime->mStatic->alloc(4 * sizeof(int));
auto size = getTensorShape(shape, src);
auto buffer = getHostBuffer(dst->size());
auto encoder = [ctx encoder];
auto kernel = kernelForConvert(src->getType(), sfmt, dfmt, Up);
@ -381,12 +409,13 @@ void MetalBackend::onCopyDeviceToHost(const Tensor *src, const Tensor *dst) cons
auto bandwidth = [ctx load:kernel encoder:encoder];
[encoder setBuffer:device offset:0 atIndex:0];
[encoder setBuffer:buffer offset:0 atIndex:1];
[encoder setBuffer:std::get<0>(shape) offset:0 atIndex:2];
[ctx dispatchEncoder:encoder threads:std::get<1>(shape) bandwidth:bandwidth];
[encoder setBuffer:shape offset:0 atIndex:2];
[ctx dispatchEncoder:encoder threads:size bandwidth:bandwidth];
[encoder endEncoding];
[ctx commit];
[ctx wait];
memcpy(dst->host<float>(), buffer.contents, dst->size());
mRuntime->mStatic->release(shape);
}
}
@ -406,7 +435,7 @@ void MetalBackend::AutoBuffer::reset(size_t length) {
void MetalBackend::onCopyDeviceToDevice(const Tensor *src, const Tensor *dst,
id<MTLComputeCommandEncoder> encoder) const {
id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> shape) const {
auto ctx = (__bridge MNNMetalContext *)context();
auto standalone = encoder == nil;
encoder = encoder ?: [ctx encoder];
@ -426,13 +455,21 @@ void MetalBackend::onCopyDeviceToDevice(const Tensor *src, const Tensor *dst,
else {
auto kernel = kernelForConvert(src->getType(), sfmt, dfmt, None);
MNN_ASSERT(kernel != nil); // unsupported sfmt to dfmt
bool needRelease = false;
if (shape == nil) {
shape = mRuntime->mStatic->alloc(4 * sizeof(int));
needRelease = true;
}
auto shape = getTensorShape(ctx, src);
auto size = getTensorShape(shape, src);
auto bandwidth = [ctx load:kernel encoder:encoder];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)(src->buffer().device) offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)(dst->buffer().device) offset:0 atIndex:1];
[encoder setBuffer:std::get<0>(shape) offset:0 atIndex:2];
[ctx dispatchEncoder:encoder threads:std::get<1>(shape) bandwidth:bandwidth];
[encoder setBuffer:shape offset:0 atIndex:2];
[ctx dispatchEncoder:encoder threads:size bandwidth:bandwidth];
if (needRelease) {
mRuntime->mStatic->release(shape);
}
}
if (standalone) {
@ -442,14 +479,23 @@ void MetalBackend::onCopyDeviceToDevice(const Tensor *src, const Tensor *dst,
}
void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst) const {
onCopyBuffer(src, dst, nil);
flushEncoder();
onCopyBuffer(src, dst, nil, nil);
}
void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst, id<MTLComputeCommandEncoder> encoder) const {
id<MTLComputeCommandEncoder> MetalBackend::encoder() const {
if (nil == mComputeEncoder) {
auto ctx = (__bridge MNNMetalContext *)context();
mComputeEncoder = [ctx encoder];
}
return mComputeEncoder;
}
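The encoder()/flushEncoder() pair above lets consecutive executions share one compute command encoder that is ended lazily before a commit or host copy; a rough C++ sketch of that lifecycle, with stand-in types rather than the Metal API:
#include <memory>

struct ComputeEncoderStub {                       // stand-in for id<MTLComputeCommandEncoder>
    void endEncoding() {}
};

class SharedEncoder {
public:
    ComputeEncoderStub* encoder() {               // mirrors MetalBackend::encoder()
        if (!mCurrent) {
            mCurrent = std::make_unique<ComputeEncoderStub>();   // [ctx encoder] in the commit
        }
        return mCurrent.get();
    }
    void flush() {                                // mirrors MetalBackend::flushEncoder()
        if (mCurrent) {
            mCurrent->endEncoding();              // end once before commit or host copies
            mCurrent.reset();
        }
    }
private:
    std::unique_ptr<ComputeEncoderStub> mCurrent; // mComputeEncoder in the commit
};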
void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst, id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> shape) const {
MNN_ASSERT(src->buffer().dimensions == dst->buffer().dimensions);
if (!src->buffer().host && !dst->buffer().host) {
onCopyDeviceToDevice(src, dst, encoder);
onCopyDeviceToDevice(src, dst, encoder, shape);
} else if (!src->buffer().host && dst->buffer().host) {
onCopyDeviceToHost(src, dst);
} else if (src->buffer().host && !dst->buffer().host) {

View File

@ -20,9 +20,12 @@ public:
MetalBinary(Backend *backend, std::string type);
virtual ~MetalBinary() = default;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
std::string mKernelName;
id<MTLBuffer> mConstBuffer;
id<MTLComputePipelineState> mPipeline;
std::pair<MTLSize, MTLSize> mThreads;
};
} // namespace MNN

View File

@ -16,8 +16,6 @@ struct binary_op_shape {
int i1stride;
int output_data_count;
int output_width;
int output_size;
int output_dimensions;
};
#define define_op(op) \
kernel void binary_##op##_x1(const device ftype *in0 [[buffer(0)]],\

View File

@ -15,33 +15,39 @@
namespace MNN {
MetalBinary::MetalBinary(Backend *backend, std::string type) : Execution(backend) {
mKernelName = "binary_" + type + "_x1";
auto mKernelName = "binary_" + type + "_x1";
auto mtbn = static_cast<MetalBackend *>(backend);
auto context = (__bridge MNNMetalContext *)mtbn->context();
mConstBuffer = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
auto kn = [NSString stringWithCString:mKernelName.c_str() encoding:[NSString defaultCStringEncoding]];
mPipeline = [context pipelineWithName:kn];
}
ErrorCode MetalBinary::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
ErrorCode MetalBinary::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto input0 = inputs[0], input1 = inputs[1], output = outputs[0];
const int input0_data_count = (int)input0->elementSize();
const int input1_data_count = (int)input1->elementSize();
auto shape = [context newDeviceBuffer:6 * sizeof(int) access:CPUWriteOnly];
auto encoder = [context encoder];
int outdatacount = output->elementSize();
((int *)shape.contents)[0] = input0_data_count == 1 ? 0 : 1;
((int *)shape.contents)[1] = input1_data_count == 1 ? 0 : 1;
((int *)shape.contents)[2] = outdatacount;
auto kn = [NSString stringWithCString:mKernelName.c_str() encoding:[NSString defaultCStringEncoding]];
((int *)mConstBuffer.contents)[0] = input0_data_count == 1 ? 0 : 1;
((int *)mConstBuffer.contents)[1] = input1_data_count == 1 ? 0 : 1;
((int *)mConstBuffer.contents)[2] = outdatacount;
((int *)mConstBuffer.contents)[3] = 0;
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(outdatacount, 1, 1)];
return NO_ERROR;
}
auto bandwidth = [context load:kn encoder:encoder];
ErrorCode MetalBinary::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto input0 = inputs[0], input1 = inputs[1], output = outputs[0];
auto encoder = backend->encoder();
[encoder setComputePipelineState:mPipeline];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input0->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input1->deviceId() offset:0 atIndex:1];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:2];
[encoder setBuffer:shape offset:0 atIndex:3];
[context dispatchEncoder:encoder threads:{ (NSUInteger) outdatacount, 1, 1 } bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
[encoder setBuffer:mConstBuffer offset:0 atIndex:3];
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
return NO_ERROR;
}
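The constants written in onResize act as broadcast flags for the binary_*_x1 kernel: 0 means the input has a single element and is re-read, 1 means it is walked element-wise. A scalar C++ sketch of that indexing (the helper is illustrative, not part of the commit):
// stride0/stride1 are the first two ints (0 for a single-element input, 1 otherwise);
// count is the output element count written as the third int.
void binaryAddScalar(const float* in0, const float* in1, float* out,
                     int stride0, int stride1, int count) {
    for (int i = 0; i < count; ++i) {
        out[i] = in0[i * stride0] + in1[i * stride1];
    }
}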

View File

@ -37,15 +37,13 @@ ErrorCode MetalCast::onExecute(const std::vector<Tensor *> &inputs, const std::v
return NOT_SUPPORT;
}
auto encoder = [context encoder];
auto encoder = backend->encoder();
auto bandwidth = [context load:kernel encoder:encoder];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[context dispatchEncoder:encoder
threads:{ (NSUInteger) output->elementSize(), (NSUInteger)1, (NSUInteger)1 }
bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;
}
static DataType _mapDataType(DataType src) {

View File

@ -24,6 +24,7 @@ protected:
virtual ErrorCode onFloat(const Tensor *input, const Tensor *output) override;
private:
const MNN::Op *mOp = nullptr;
int mThreadgroupMemory = 0;
bool mLocalPreferred = false;
bool isThreadgroupLocalPreferred(const Tensor *input, const Tensor *output);

View File

@ -17,6 +17,7 @@
namespace MNN {
MetalConvolution::MetalConvolution(Backend *backend, const MNN::Op *op) : MetalConvolutionCommon(backend, op) {
mOp = op;
loadWeight(op->main_as_Convolution2D());
}
@ -52,17 +53,9 @@ ErrorCode MetalConvolution::onResize(const std::vector<Tensor *> &inputs, const
auto input = inputs[0], output = outputs[0];
auto iw = input->width(), ih = input->height(), igz = UP_DIV(input->channel(), 4) / mGroups;
auto ow = output->width(), oh = output->height(), ogz = UP_DIV(output->channel(), 4) / mGroups;
// pad mode support
int padX = mPadX, padY = mPadY;
if (mPadMode == PadMode_SAME) {
int kernelWidthSize = (mKernelX - 1) * mDilateX + 1;
int kernelHeightSize = (mKernelY - 1) * mDilateY + 1;
int pw = (ow - 1) * mStrideX + kernelWidthSize - iw;
int ph = (oh - 1) * mStrideY + kernelHeightSize - ih;
padX = pw / 2;
padY = ph / 2;
}
auto pads = ConvolutionCommon::convolutionPad(input, output, mOp->main_as_Convolution2D()->common());
auto padX = pads.first;
auto padY = pads.second;
// update threadgroup memory if needed
int stepSlices = igz;
@ -112,7 +105,7 @@ ErrorCode MetalConvolution::onFloat(const Tensor *input, const Tensor *output) {
auto ib = iw * ih * iz * 4 * unit, ig = ib / mGroups;
auto ob = ow * oh * oz * 4 * sizeof(metal_float), og = ob / mGroups;
auto encoder = [context encoder];
auto encoder = backend->encoder();
auto bandwidth = (MetalBandwidth){};
MTLSize threads = {};
if (mLocalPreferred) {
@ -144,7 +137,6 @@ ErrorCode MetalConvolution::onFloat(const Tensor *input, const Tensor *output) {
}
}
}
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;
}
@ -152,6 +144,12 @@ ErrorCode MetalConvolution::onFloat(const Tensor *input, const Tensor *output) {
class MetalConvolutionCreator : public MetalBackend::Creator {
public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend) const {
auto param = op->main_as_Convolution2D();
if (param->quanParameter() != nullptr) {
if (param->quanParameter()->has_scaleInt()) {
return nullptr;
}
}
if (op->type() == OpType_Convolution) {
auto conv = op->main_as_Convolution2D();
auto input = inputs[0];

View File

@ -23,6 +23,9 @@ public:
protected:
virtual ErrorCode onFloat(const Tensor *input, const Tensor *output) override;
private:
id<MTLComputePipelineState> mPipeline;
std::pair<MTLSize, MTLSize> mThreads;
};
} // namespace MNN

View File

@ -51,27 +51,73 @@ kernel void conv1x1_g1z4(const device ftype4 *in [[buffer(0)]],
const device ftype4x4 *wt [[buffer(3)]],
const device ftype4 *biasTerms [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
if ((int)gid.x >= cst.output_size || (int)gid.y * CONV_UNROLL >= cst.output_slice || (int)gid.z >= cst.batch) return;
if ((int)gid.x * CONV_UNROLL >= cst.output_size || (int)gid.y >= cst.output_slice || (int)gid.z >= cst.batch) return;
int uz = gid.y * CONV_UNROLL;
auto xy_wt0 = wt + uz * cst.input_slice;
auto xy_wt1 = uz + 1 < cst.output_slice ? xy_wt0 + cst.input_slice : nullptr;
auto xy_wt2 = uz + 2 < cst.output_slice ? xy_wt1 + cst.input_slice : nullptr;
auto xy_wt3 = uz + 3 < cst.output_slice ? xy_wt2 + cst.input_slice : nullptr;
auto xy_in = in + (int)gid.z * cst.input_slice * cst.input_size + (int)gid.x;
auto xy_out = out + (int)gid.z * cst.output_slice * cst.output_size + uz * cst.output_size + (int)gid.x;
float4 result0 = 0, result1 = 0, result2 = 0, result3 = 0;
for (auto z = 0; z < cst.input_slice; z++, xy_in += cst.input_size) {
auto in4 = *xy_in;
/* true */ result0 += float4(in4 * xy_wt0[z]);
if (xy_wt1) result1 += float4(in4 * xy_wt1[z]);
if (xy_wt2) result2 += float4(in4 * xy_wt2[z]);
if (xy_wt3) result3 += float4(in4 * xy_wt3[z]);
int rx = gid.x * CONV_UNROLL;
int uz = gid.y;
auto xy_wt = wt + uz * cst.input_slice;
auto xy_in0 = in + (int)gid.z * cst.input_slice * cst.input_size + rx + 0;
auto xy_in1 = in + (int)gid.z * cst.input_slice * cst.input_size + rx + 1;
auto xy_in2 = in + (int)gid.z * cst.input_slice * cst.input_size + rx + 2;
auto xy_in3 = in + (int)gid.z * cst.input_slice * cst.input_size + rx + 3;
auto xy_out = out + (int)gid.z * cst.output_slice * cst.output_size + uz * cst.output_size + rx;
auto biasValue = float4(biasTerms[uz]);
float4 result0 = biasValue, result1 = biasValue, result2 = biasValue, result3 = biasValue;
int computeSize = min(cst.output_size - rx, CONV_UNROLL);
if (computeSize == CONV_UNROLL) {
for (auto z = 0; z < cst.input_slice; z++) {
auto in40 = *xy_in0;
auto in41 = *xy_in1;
auto in42 = *xy_in2;
auto in43 = *xy_in3;
auto w = xy_wt[z];
result0 += float4(in40 * w);
result1 += float4(in41 * w);
result2 += float4(in42 * w);
result3 += float4(in43 * w);
xy_in0 += cst.input_size;
xy_in1 += cst.input_size;
xy_in2 += cst.input_size;
xy_in3 += cst.input_size;
}
} else if (computeSize == 3) {
for (auto z = 0; z < cst.input_slice; z++) {
auto in40 = *xy_in0;
auto in41 = *xy_in1;
auto in42 = *xy_in2;
auto w = xy_wt[z];
result0 += float4(in40 * w);
result1 += float4(in41 * w);
result2 += float4(in42 * w);
xy_in0 += cst.input_size;
xy_in1 += cst.input_size;
xy_in2 += cst.input_size;
}
} else if (computeSize == 2) {
for (auto z = 0; z < cst.input_slice; z++) {
auto in40 = *xy_in0;
auto in41 = *xy_in1;
auto w = xy_wt[z];
result0 += float4(in40 * w);
result1 += float4(in41 * w);
xy_in0 += cst.input_size;
xy_in1 += cst.input_size;
}
} else {
for (auto z = 0; z < cst.input_slice; z++) {
auto in40 = *xy_in0;
auto w = xy_wt[z];
result0 += float4(in40 * w);
xy_in0 += cst.input_size;
}
}
/* true */ *xy_out = activate(ftype4(result0 + float4(biasTerms[uz + 0])), cst.activation);
if (xy_wt1) { xy_out += cst.output_size; *xy_out = activate(ftype4(result1 + float4(biasTerms[uz + 1])), cst.activation); }
if (xy_wt2) { xy_out += cst.output_size; *xy_out = activate(ftype4(result2 + float4(biasTerms[uz + 2])), cst.activation); }
if (xy_wt3) { xy_out += cst.output_size; *xy_out = activate(ftype4(result3 + float4(biasTerms[uz + 3])), cst.activation); }
/* true */ *xy_out = activate(ftype4(result0), cst.activation);
if (computeSize > 1) {xy_out[1] = activate(ftype4(result1), cst.activation); }
if (computeSize > 2) {xy_out[2] = activate(ftype4(result2), cst.activation); }
if (computeSize > 3) {xy_out[3] = activate(ftype4(result3), cst.activation); }
}
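A scalar C++ rendering of the CONV_UNROLL scheme above, dropping the float4 vectors and the activation; only CONV_UNROLL and the clamped computeSize follow the kernel, the rest is illustrative:
#include <algorithm>
#include <vector>

constexpr int CONV_UNROLL = 4;

// in:  [input_slice][size]   (for 1x1 conv, input and output spatial size match)
// wt:  [output_slice][input_slice]
// out: [output_slice][size]
void conv1x1RowBlock(const std::vector<float>& in, const std::vector<float>& wt,
                     const std::vector<float>& bias, std::vector<float>& out,
                     int inputSlice, int size, int oz, int rx) {
    const int computeSize = std::min(size - rx, CONV_UNROLL);
    float acc[CONV_UNROLL] = {};
    for (int i = 0; i < computeSize; ++i) {
        acc[i] = bias[oz];                        // start from the bias, as above
    }
    for (int z = 0; z < inputSlice; ++z) {
        const float w = wt[oz * inputSlice + z];
        for (int i = 0; i < computeSize; ++i) {
            acc[i] += in[z * size + rx + i] * w;
        }
    }
    for (int i = 0; i < computeSize; ++i) {
        out[oz * size + rx + i] = acc[i];         // tail writes only computeSize values
    }
}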

View File

@ -33,38 +33,35 @@ ErrorCode MetalConvolution1x1::onResize(const std::vector<Tensor *> &inputs, con
auto input = inputs[0], output = outputs[0];
auto is = input->width() * input->height(), iz = UP_DIV(input->channel(), 4), igz = iz / mGroups;
auto os = output->width() * output->height(), oz = UP_DIV(output->channel(), 4), ogz = oz / mGroups;
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
// create const buffer
int constants[] = {is, igz, iz, os, ogz, oz, output->batch(), mActivationType};
mConstBuffer.reset(sizeof(constants));
::memcpy(mConstBuffer.buffer().contents, constants, sizeof(constants));
auto w = output->width(), h = output->height(), z = UP_DIV(output->channel(), 4), b = output->batch();;
if (mGroups == 1 && (w * h >= 32)) {
mPipeline = [context pipelineWithName:@"conv1x1_g1z4"];
mThreads = [context computeBestGroupAndLocal:mPipeline threads:{(NSUInteger)UP_DIV(w * h, 4), (NSUInteger)z, (NSUInteger)b}];
} else {
mPipeline = [context pipelineWithName:@"conv1x1"];
mThreads = [context computeBestGroupAndLocal:mPipeline threads:{(NSUInteger)w * h, (NSUInteger)z, (NSUInteger)b}];
}
return NO_ERROR;
}
ErrorCode MetalConvolution1x1::onFloat(const Tensor *input, const Tensor *output) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto w = output->width(), h = output->height(), z = UP_DIV(output->channel(), 4), b = output->batch();;
auto encoder = [context encoder];
auto bandwidth = (MetalBandwidth){};
MTLSize threads = {};
if (mGroups == 1 && (w * h * b >= 32 ? z >= 16 : z >= 128)) {
bandwidth = [context load:@"conv1x1_g1z4" encoder:encoder];
threads = {(NSUInteger)w * h, (NSUInteger)UP_DIV(z, 4), (NSUInteger)b};
} else {
bandwidth = [context load:@"conv1x1" encoder:encoder];
threads = {(NSUInteger)w * h, (NSUInteger)z, (NSUInteger)b};
}
bandwidth.zAxisProtected = YES;
auto encoder = backend->encoder();
[encoder setComputePipelineState:mPipeline];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:mConstBuffer.buffer() offset:0 atIndex:2];
[encoder setBuffer:mWeight offset:0 atIndex:3];
[encoder setBuffer:mBias offset:0 atIndex:4];
[context dispatchEncoder:encoder threads:threads bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
return NO_ERROR;
}
} // namespace MNN

View File

@ -23,6 +23,9 @@ public:
protected:
virtual ErrorCode onFloat(const Tensor *input, const Tensor *output) override;
virtual id<MTLBuffer> weightForFloat(int group, int oc, int ic, int kh, int kw, const float *src) override;
private:
id<MTLComputePipelineState> mPipeline;
std::pair<MTLSize, MTLSize> mThreads;
};
} // namespace MNN

View File

@ -60,24 +60,24 @@ ErrorCode MetalConvolutionDepthwise::onResize(const std::vector<Tensor *> &input
mActivationType};
mConstBuffer.reset(sizeof(constants));
::memcpy(mConstBuffer.buffer().contents, constants, sizeof(constants));
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
mPipeline = [context pipelineWithName:@"conv_depthwise"];
auto w = output->width(), h = output->height(), z = UP_DIV(output->channel(), 4), b = output->batch();
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(w, h, z*b)];
return NO_ERROR;
}
ErrorCode MetalConvolutionDepthwise::onFloat(const Tensor *input, const Tensor *output) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto w = output->width(), h = output->height(), z = UP_DIV(output->channel(), 4), b = output->batch();
auto encoder = [context encoder];
auto bandwidth = [context load:@"conv_depthwise" encoder:encoder];
auto encoder = backend->encoder();
[encoder setComputePipelineState:mPipeline];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:mConstBuffer.buffer() offset:0 atIndex:2];
[encoder setBuffer:mWeight offset:0 atIndex:3];
[encoder setBuffer:mBias offset:0 atIndex:4];
[context dispatchEncoder:encoder threads:{ (NSUInteger)w, (NSUInteger)h, (NSUInteger)z * b } bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
return NO_ERROR;
}
@ -112,6 +112,9 @@ id<MTLBuffer> MetalConvolutionDepthwise::weightForFloat(int group, int oc, int i
class MetalConvolutionDepthwiseCreator : public MetalBackend::Creator {
public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend) const {
if (inputs.size() > 1) {
return nullptr;
}
return new MetalConvolutionDepthwise(backend, op);
}
};

View File

@ -30,6 +30,12 @@ private:
id<MTLBuffer> mShapeBuffer = nil;
std::shared_ptr<Tensor> mTempInput;
std::shared_ptr<Tensor> mTempOutput;
id<MTLComputePipelineState> mPipelineGEMM;
std::pair<MTLSize, MTLSize> mGemm;
id<MTLComputePipelineState> mPipelineIm2Col;
std::pair<MTLSize, MTLSize> mIm2Col;
id<MTLComputePipelineState> mPipelineCol2Im;
std::pair<MTLSize, MTLSize> mCol2Im;
};
} // namespace MNN

View File

@ -21,7 +21,6 @@ bool MetalConvolutionGEMM::isValid(const Convolution2D *conv, const Tensor *inpu
if (kx == 1 || ky == 1 || common->group() != 1) {
return false;
}
auto oc = common->outputCount();
if (oc <= 64) {
return false;
@ -106,6 +105,14 @@ ErrorCode MetalConvolutionGEMM::onResize(const std::vector<Tensor *> &inputs, co
}
backend->onReleaseBuffer(mTempInput.get(), Backend::DYNAMIC);
backend->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
mPipelineGEMM = [context pipelineWithName:@"matmul4x4"];
mPipelineIm2Col = [context pipelineWithName:@"conv_im2col"];
mPipelineCol2Im = [context pipelineWithName:@"conv_col2im"];
NSUInteger gw = UP_DIV(output->width() * output->height() * output->batch(), 4);
NSUInteger gh = UP_DIV(output->channel(), 4);
mGemm = [context computeBestGroupAndLocal:mPipelineGEMM threads:{gw, gh, 1}];
mIm2Col = [context computeBestGroupAndLocal:mPipelineIm2Col threads:{(NSUInteger)ow, (NSUInteger)oh, (NSUInteger)iz*ob}];
mCol2Im = [context computeBestGroupAndLocal:mPipelineCol2Im threads:{(NSUInteger)ow, (NSUInteger)oh, (NSUInteger)oz*ob}];
return NO_ERROR;
}
@ -115,42 +122,31 @@ ErrorCode MetalConvolutionGEMM::onExecute(const std::vector<Tensor *> &inputs, c
ErrorCode MetalConvolutionGEMM::onFloat(const Tensor *input, const Tensor *output) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto encoder = [context encoder];
auto encoder = backend->encoder();
{ // im2col
NSUInteger iz = UP_DIV(input->channel(), 4), ib = input->batch();
NSUInteger ow = output->width(), oh = output->height();
auto bandwidth = [context load:@"conv_im2col" encoder:encoder];
[encoder setComputePipelineState:mPipelineIm2Col];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)mTempInput->deviceId() offset:0 atIndex:1];
[encoder setBuffer:mConstBuffer.buffer() offset:0 atIndex:2];
[context dispatchEncoder:encoder threads:{ ow, oh, iz *ib } bandwidth:bandwidth];
[encoder dispatchThreadgroups:mIm2Col.first threadsPerThreadgroup:mIm2Col.second];
}
{ // gemm
NSUInteger gw = UP_DIV(output->width() * output->height() * output->batch(), 4);
NSUInteger gh = UP_DIV(output->channel(), 4);
auto bandwidth = [context load:@"matmul4x4" encoder:encoder];
[encoder setComputePipelineState:mPipelineGEMM];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)mTempInput->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)mTempOutput->deviceId() offset:0 atIndex:1];
[encoder setBuffer:mWeight offset:0 atIndex:2];
[encoder setBuffer:mShapeBuffer offset:0 atIndex:3];
[context dispatchEncoder:encoder threads:{ gw, gh, 1 } bandwidth:bandwidth];
[encoder dispatchThreadgroups:mGemm.first threadsPerThreadgroup:mGemm.second];
}
{ // col2im
NSUInteger ow = output->width(), oh = output->height(), oz = UP_DIV(output->channel(), 4), ob = output->batch();
auto bandwidth = [context load:@"conv_col2im" encoder:encoder];
[encoder setComputePipelineState:mPipelineCol2Im];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)mTempOutput->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:mBias offset:0 atIndex:2];
[encoder setBuffer:mConstBuffer.buffer() offset:0 atIndex:3];
[context dispatchEncoder:encoder threads:{ ow, oh, oz *ob } bandwidth:bandwidth];
[encoder dispatchThreadgroups:mCol2Im.first threadsPerThreadgroup:mCol2Im.second];
}
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;
}

View File

@ -131,7 +131,7 @@ ErrorCode MetalConvolutionWinograd::onResize(const std::vector<Tensor *> &inputs
ErrorCode MetalConvolutionWinograd::onFloat(const Tensor *input, const Tensor *output) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto encoder = [context encoder];
auto encoder = backend->encoder();
{ // transform
auto bandwidth = [context load:mKernelX == 3 ? @"winograd_transform_source2_3_1" : @"winograd_transform_source2_5_1" encoder:encoder];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
@ -155,7 +155,6 @@ ErrorCode MetalConvolutionWinograd::onFloat(const Tensor *input, const Tensor *o
[encoder setBuffer:mConstBuffer.buffer() offset:0 atIndex:3];
[context dispatchEncoder:encoder threads:mOutputTransformThreads bandwidth:bandwidth];
}
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;

View File

@ -39,10 +39,9 @@ private:
id<MTLBuffer> mWeight = nil;
id<MTLBuffer> mBias = nil;
id<MTLBuffer> mConstBuffer = nil;
id<MTLComputePipelineState> mPipeline;
std::pair<MTLSize, MTLSize> mThreads;
private:
ErrorCode onDepthwise(const Tensor *input, const Tensor *output);
ErrorCode onDeconv(const Tensor *input, const Tensor *output);
};
} // namespace MNN

View File

@ -36,6 +36,7 @@ struct deconv_constants {
int delta_iy;
int delta_ix;
int has_bias;
int batch;
};
kernel void deconv(const device ftype4 *in [[buffer(0)]],
@ -44,7 +45,7 @@ kernel void deconv(const device ftype4 *in [[buffer(0)]],
const device ftype4x4 *wt [[buffer(3)]],
const device ftype4 *biasTerms [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
if ((int)gid.x >= cst.output_width || (int)gid.y >= cst.output_height) return;
if ((int)gid.x >= cst.output_width || (int)gid.y >= cst.output_height || (int)gid.z >= cst.batch * cst.output_slice) return;
short b = gid.z / cst.output_slice;
short o = gid.z % cst.output_slice;
@ -86,9 +87,9 @@ kernel void deconv_depthwise(const device ftype4 *in [[buffer(0)]],
const device ftype4 *wt [[buffer(3)]],
const device ftype4 *biasTerms [[buffer(4)]],
ushort3 gid [[thread_position_in_grid]]) {
if ((int)gid.x >= cst.output_width || (int)gid.y >= cst.output_height) return;
if ((int)gid.x >= cst.output_width || (int)gid.y >= cst.output_height || (int)gid.z >= cst.batch * cst.output_slice) return;
float4 result = float4(biasTerms[(short)gid.z]);
float4 result = float4(biasTerms[(short)(gid.z % cst.input_slice)]);
short oy = (short)gid.y + cst.pad_y;
short ox = (short)gid.x + cst.pad_x;

View File

@ -140,6 +140,11 @@ MetalDeconvolution::MetalDeconvolution(Backend *backend, const MNN::Op *op) : Ex
}
mWeight = weightForDeconv(context, mDepthwise, deconv, qnt.get());
mBias = biasForDeconv(context, deconv);
if (mDepthwise) {
mPipeline = [context pipelineWithName:@"deconv_depthwise"];
} else {
mPipeline = [context pipelineWithName:@"deconv"];
}
}
ErrorCode MetalDeconvolution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
@ -148,6 +153,7 @@ ErrorCode MetalDeconvolution::onResize(const std::vector<Tensor *> &inputs, cons
auto input = inputs[0], output = outputs[0];
int iw = input->width(), ih = input->height(), iz = UP_DIV(input->channel(), 4);
int ow = output->width(), oh = output->height(), oz = UP_DIV(output->channel(), 4);
int ob = output->batch();
// pad mode support
int padX = mPadX, padY = mPadY;
@ -185,63 +191,26 @@ ErrorCode MetalDeconvolution::onResize(const std::vector<Tensor *> &inputs, cons
deltaKy * mDilateY / mStrideY,
deltaKx * mDilateX / mStrideX,
mBias.length > 0,
ob
};
mConstBuffer = [context newDeviceBuffer:sizeof(consts) bytes:consts access:CPUWriteOnly];
return NO_ERROR;
}
ErrorCode MetalDeconvolution::onDepthwise(const Tensor *input, const Tensor *output) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
int unit = sizeof(metal_float);
auto iw = input->width(), ih = input->height(), iz = UP_DIV(input->channel(), 4), ib = iw * ih * iz * 4 * unit;
auto ow = output->width(), oh = output->height(), oz = UP_DIV(output->channel(), 4), ob = ow * oh * oz * 4 * unit;
// run
auto encoder = [context encoder];
auto bandwidth = [context load:@"deconv_depthwise" encoder:encoder];
for (int b = 0; b < input->batch(); b++) {
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:b * ib atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:b * ob atIndex:1];
[encoder setBuffer:mConstBuffer offset:0 atIndex:2];
[encoder setBuffer:mWeight offset:0 atIndex:3];
[encoder setBuffer:mBias offset:0 atIndex:4];
[context dispatchEncoder:encoder
threads:{ (NSUInteger) ow, (NSUInteger)oh, (NSUInteger)oz }
bandwidth:bandwidth];
}
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;
}
ErrorCode MetalDeconvolution::onDeconv(const Tensor *input, const Tensor *output) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
int ow = output->width(), oh = output->height(), oc = output->channel(), oz = UP_DIV(oc, 4), ob = output->batch();
// run
auto encoder = [context encoder];
auto bandwidth = [context load:@"deconv" encoder:encoder];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:mConstBuffer offset:0 atIndex:2];
[encoder setBuffer:mWeight offset:0 atIndex:3];
[encoder setBuffer:mBias offset:0 atIndex:4];
[context dispatchEncoder:encoder
threads:{ (NSUInteger) ow, (NSUInteger)oh, (NSUInteger)oz * ob }
bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake((NSUInteger) ow, (NSUInteger)oh, (NSUInteger)oz * ob)];
return NO_ERROR;
}
ErrorCode MetalDeconvolution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto input = inputs[0], output = outputs[0];
if (mDepthwise) {
return onDepthwise(input, output);
} else {
return onDeconv(input, output);
}
auto backend = static_cast<MetalBackend *>(this->backend());
// run
auto encoder = backend->encoder();
[encoder setComputePipelineState:mPipeline];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:mConstBuffer offset:0 atIndex:2];
[encoder setBuffer:mWeight offset:0 atIndex:3];
[encoder setBuffer:mBias offset:0 atIndex:4];
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
return NO_ERROR;
}
class MetalDeconvolutionCreator : public MetalBackend::Creator {

View File

@ -1,39 +0,0 @@
//
// MetalDequantize.hpp
// MNN
//
// Created by MNN on 2019/01/30.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef MetalDequantize_hpp
#define MetalDequantize_hpp
#import "core/Execution.hpp"
#import "MNN_generated.h"
#import "MetalDefine.h"
#if MNN_METAL_ENABLED
namespace MNN {
class MetalDequantize : public Execution {
public:
MetalDequantize(Backend *backend, const Dequantize *dq);
virtual ~MetalDequantize() = default;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
ErrorCode onTFLite(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
ErrorCode onMinCombined(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
ErrorCode onMinFirst(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
ErrorCode onScaled(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
private:
ModeFormat mModeFormat;
DataType mType;
QuantizeMode mMode;
};
} // namespace MNN
#endif /* MNN_METAL_ENABLED */
#endif /* MetalDequantize_hpp */

View File

@ -1,186 +0,0 @@
//
// MetalDequantize.metal
// MNN
//
// Created by MNN on 2018/11/13.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include <metal_stdlib>
#include "MetalDefine.metal"
using namespace metal;
kernel void dequantize_min_combined_uint8(const device uchar *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float min_range = input1[0];
float max_range = input2[0];
float factor = (max_range - min_range) / UINT8_MAX;
output[int(gid)] = input0[int(gid)] * factor + min_range;
}
kernel void dequantize_min_combined_uint16(const device ushort *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float min_range = input1[0];
float max_range = input2[0];
float factor = (max_range - min_range) / UINT16_MAX;
output[int(gid)] = input0[int(gid)] * factor + min_range;
}
kernel void dequantize_min_combined_int8(const device char *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float min_range = input1[0];
float max_range = input2[0];
float half_range = (INT8_MAX - INT8_MIN + 1) / 2.f;
float factor = (max_range - min_range) / (INT8_MAX - INT8_MIN);
output[int(gid)] = (input0[int(gid)] + half_range) * factor + min_range;
}
kernel void dequantize_min_combined_int16(const device short *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float min_range = input1[0];
float max_range = input2[0];
float half_range = (INT16_MAX - INT16_MIN + 1) / 2.f;
float factor = (max_range - min_range) / (INT16_MAX - INT16_MIN);
output[int(gid)] = (input0[int(gid)] + half_range) * factor + min_range;
}
kernel void dequantize_min_combined_int32(const device int *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float min_range = input1[0];
float max_range = input2[0];
float half_range = (INT32_MAX - INT32_MIN + 1) / 2.f;
float factor = (max_range - min_range) / (INT32_MAX - INT32_MIN);
output[int(gid)] = (input0[int(gid)] + half_range) * factor + min_range;
}
kernel void dequantize_min_first_uint8(const device uchar *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float min_range = input1[0];
float max_range = input2[0];
float mask = float(uchar(~0));
float range_scale = (max_range - min_range) / mask;
float range_min_rounded = max_range == min_range ? min_range : round(min_range / range_scale) * range_scale;
output[int(gid)] = input0[int(gid)] * range_scale + range_min_rounded;
}
kernel void dequantize_min_first_uint16(const device ushort *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float min_range = input1[0];
float max_range = input2[0];
float mask = float(ushort(~0));
float range_scale = (max_range - min_range) / mask;
float range_min_rounded = max_range == min_range ? min_range : round(min_range / range_scale) * range_scale;
output[int(gid)] = input0[int(gid)] * range_scale + range_min_rounded;
}
kernel void dequantize_min_first_int8(const device char *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float min_range = input1[0];
float max_range = input2[0];
float mask = float(uchar(~0));
float range_scale = (max_range - min_range) / mask;
float range_min_rounded = max_range == min_range ? min_range : round(min_range / range_scale) * range_scale;
float lowest_quantized = float(INT8_MIN);
float result_add = range_min_rounded - lowest_quantized * range_scale;
output[int(gid)] = input0[int(gid)] * range_scale + result_add;
}
kernel void dequantize_min_first_int16(const device short *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float min_range = input1[0];
float max_range = input2[0];
float mask = float(ushort(~0));
float range_scale = (max_range - min_range) / mask;
float range_min_rounded = max_range == min_range ? min_range : round(min_range / range_scale) * range_scale;
float lowest_quantized = float(INT16_MIN);
float result_add = range_min_rounded - lowest_quantized * range_scale;
output[int(gid)] = input0[int(gid)] * range_scale + result_add;
}
kernel void dequantize_min_first_int32(const device int *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float min_range = input1[0];
float max_range = input2[0];
float mask = float(uint(~0));
float range_scale = (max_range - min_range) / mask;
float range_min_rounded = max_range == min_range ? min_range : round(min_range / range_scale) * range_scale;
float lowest_quantized = float(INT32_MIN);
float result_add = range_min_rounded - lowest_quantized * range_scale;
output[int(gid)] = input0[int(gid)] * range_scale + result_add;
}
kernel void dequantize_scaled_uint8(const device uchar *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float max_range = input2[0];
float factor = max_range / UINT8_MAX;
output[int(gid)] = input0[int(gid)] * factor;
}
kernel void dequantize_scaled_uint16(const device ushort *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float max_range = input2[0];
float factor = max_range / UINT16_MAX;
output[int(gid)] = input0[int(gid)] * factor;
}
kernel void dequantize_scaled_int8(const device char *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float min_range = input1[0];
float max_range = input2[0];
float factor = max(min_range / INT8_MIN, max_range / INT8_MAX);
output[int(gid)] = input0[int(gid)] * factor;
}
kernel void dequantize_scaled_int16(const device short *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float min_range = input1[0];
float max_range = input2[0];
float factor = max(min_range / INT16_MIN, max_range / INT16_MAX);
output[int(gid)] = input0[int(gid)] * factor;
}
kernel void dequantize_scaled_int32(const device int *input0 [[buffer(0)]],
const device ftype *input1 [[buffer(1)]],
const device ftype *input2 [[buffer(2)]],
device ftype *output [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
float min_range = input1[0];
float max_range = input2[0];
float factor = max(min_range / INT32_MIN, max_range / INT32_MAX);
output[int(gid)] = input0[int(gid)] * factor;
}

View File

@ -1,151 +0,0 @@
//
// MetalDequantize.mm
// MNN
//
// Created by MNN on 2019/01/30.
// Copyright © 2018, Alibaba Group Holding Limited
//
#import "backend/metal/MetalDequantize.hpp"
#import "backend/metal/MNNMetalContext.h"
#import "core/Macro.h"
#import "backend/metal/MetalBackend.hpp"
#if MNN_METAL_ENABLED
namespace MNN {
MetalDequantize::MetalDequantize(Backend *backend, const Dequantize *dq)
: Execution(backend), mModeFormat(dq->modelFormat()), mType(dq->type()), mMode(dq->mode()) {
// nothing to do
}
ErrorCode MetalDequantize::onMinCombined(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto &input0 = inputs[0], &input1 = inputs[1], &input2 = inputs[2], &output0 = outputs[0];
auto encoder = [context encoder];
auto bandwidth = (MetalBandwidth){};
switch (mType) {
case DataType_DT_QUINT8:
bandwidth = [context load:@"dequantize_min_combined_uint8" encoder:encoder];
break;
case DataType_DT_QUINT16:
bandwidth = [context load:@"dequantize_min_combined_uint16" encoder:encoder];
break;
case DataType_DT_QINT8:
bandwidth = [context load:@"dequantize_min_combined_int8" encoder:encoder];
break;
case DataType_DT_QINT16:
bandwidth = [context load:@"dequantize_min_combined_int16" encoder:encoder];
break;
case DataType_DT_QINT32:
bandwidth = [context load:@"dequantize_min_combined_int32" encoder:encoder];
break;
default:
MNN_ASSERT(false); // unsupported type
return NOT_SUPPORT;
}
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input0->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input1->deviceId() offset:0 atIndex:1];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input2->deviceId() offset:0 atIndex:2];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output0->deviceId() offset:0 atIndex:3];
[context dispatchEncoder:encoder threads:{ (NSUInteger) output0->elementSize(), 1, 1 } bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;
}
ErrorCode MetalDequantize::onMinFirst(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto &input0 = inputs[0], &input1 = inputs[1], &input2 = inputs[2], &output0 = outputs[0];
auto encoder = [context encoder];
auto bandwidth = (MetalBandwidth){};
switch (mType) {
case DataType_DT_QUINT8:
bandwidth = [context load:@"dequantize_min_first_uint8" encoder:encoder];
break;
case DataType_DT_QUINT16:
bandwidth = [context load:@"dequantize_min_first_uint16" encoder:encoder];
break;
case DataType_DT_QINT8:
bandwidth = [context load:@"dequantize_min_first_int8" encoder:encoder];
break;
case DataType_DT_QINT16:
bandwidth = [context load:@"dequantize_min_first_int16" encoder:encoder];
break;
case DataType_DT_QINT32:
bandwidth = [context load:@"dequantize_min_first_int32" encoder:encoder];
break;
default:
MNN_ASSERT(false); // unsupported type
return NOT_SUPPORT;
}
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input0->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input1->deviceId() offset:0 atIndex:1];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input2->deviceId() offset:0 atIndex:2];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output0->deviceId() offset:0 atIndex:3];
[context dispatchEncoder:encoder threads:{ (NSUInteger) output0->elementSize(), 1, 1 } bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;
}
ErrorCode MetalDequantize::onScaled(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto &input0 = inputs[0], &input1 = inputs[1], &input2 = inputs[2], &output0 = outputs[0];
auto encoder = [context encoder];
auto bandwidth = (MetalBandwidth){};
switch (mType) {
case DataType_DT_QUINT8:
bandwidth = [context load:@"dequantize_scaled_uint8" encoder:encoder];
break;
case DataType_DT_QUINT16:
bandwidth = [context load:@"dequantize_scaled_uint16" encoder:encoder];
break;
case DataType_DT_QINT8:
bandwidth = [context load:@"dequantize_scaled_int8" encoder:encoder];
break;
case DataType_DT_QINT16:
bandwidth = [context load:@"dequantize_scaled_int16" encoder:encoder];
break;
case DataType_DT_QINT32:
bandwidth = [context load:@"dequantize_scaled_int32" encoder:encoder];
break;
default:
MNN_ASSERT(false); // unsupported type
return NOT_SUPPORT;
}
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input0->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input1->deviceId() offset:0 atIndex:1];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input2->deviceId() offset:0 atIndex:2];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output0->deviceId() offset:0 atIndex:3];
[context dispatchEncoder:encoder threads:{ (NSUInteger) output0->elementSize(), 1, 1 } bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;
}
ErrorCode MetalDequantize::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
switch (mMode) {
case QuantizeMode_MIN_COMBINED:
return onMinCombined(inputs, outputs);
case QuantizeMode_MIN_FIRST:
return onMinFirst(inputs, outputs);
case QuantizeMode_SCALED:
return onScaled(inputs, outputs);
default:
return NOT_SUPPORT;
}
}
class MetalDequantizeCreator : public MetalBackend::Creator {
public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend) const {
return new MetalDequantize(backend, op->main_as_Dequantize());
}
};
REGISTER_METAL_OP_CREATOR(MetalDequantizeCreator, OpType_Dequantize);
} // namespace MNN
#endif /* MNN_METAL_ENABLED */

View File

@ -21,10 +21,13 @@ public:
MetalEltwise(Backend *backend, EltwiseType type);
virtual ~MetalEltwise() = default;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
EltwiseType mType;
void encode(NSString *kernel, const Tensor *input0, const Tensor *input1, const Tensor *output);
void encode(const Tensor *input0, const Tensor *input1, const Tensor *output);
id<MTLComputePipelineState> mPipeline;
id<MTLBuffer> mConst;
std::pair<MTLSize, MTLSize> mThreads;
};
} // namespace MNN

View File

@ -11,23 +11,33 @@
using namespace metal;
kernel void eltwise_prod(device const ftype4 *in0 [[buffer(0)]],
device const ftype4 *in1 [[buffer(1)]],
device ftype4 *out [[buffer(2)]],
kernel void eltwise_prod(device const ftype *in0 [[buffer(0)]],
device const ftype *in1 [[buffer(1)]],
device ftype *out [[buffer(2)]],
constant int4& shape [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
out[(int)gid] = in0[(int)gid] * in1[(int)gid];
if ((int)gid < shape.x) {
out[(int)gid] = in0[(int)gid] * in1[(int)gid];
}
}
kernel void eltwise_max(device const ftype4 *in0 [[buffer(0)]],
device const ftype4 *in1 [[buffer(1)]],
device ftype4 *out [[buffer(2)]],
kernel void eltwise_max(device const ftype *in0 [[buffer(0)]],
device const ftype *in1 [[buffer(1)]],
device ftype *out [[buffer(2)]],
constant int4& shape [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
out[(int)gid] = max(in0[(int)gid], in1[(int)gid]);
if ((int)gid < shape.x) {
out[(int)gid] = max(in0[(int)gid], in1[(int)gid]);
}
}
kernel void eltwise_add(device const ftype4 *in0 [[buffer(0)]],
device const ftype4 *in1 [[buffer(1)]],
device ftype4 *out [[buffer(2)]],
kernel void eltwise_add(device const ftype *in0 [[buffer(0)]],
device const ftype *in1 [[buffer(1)]],
device ftype *out [[buffer(2)]],
constant int4& shape [[buffer(3)]],
uint gid [[thread_position_in_grid]]) {
out[(int)gid] = in0[(int)gid] + in1[(int)gid];
if ((int)gid < shape.x) {
out[(int)gid] = in0[(int)gid] + in1[(int)gid];
}
}

View File

@ -14,26 +14,12 @@
#if MNN_METAL_ENABLED
namespace MNN {
MetalEltwise::MetalEltwise(Backend *backend, EltwiseType type) : Execution(backend), mType(type) {
// nothing to do
}
void MetalEltwise::encode(NSString *kernel, const Tensor *input0, const Tensor *input1, const Tensor *output) {
auto metal = static_cast<MetalBackend *>(this->backend());
MetalEltwise::MetalEltwise(Backend *backend, EltwiseType type) : Execution(backend) {
auto metal = static_cast<MetalBackend *>(backend);
auto context = (__bridge MNNMetalContext *)metal->context();
auto encoder = [context encoder];
auto bandwidth = [context load:kernel encoder:encoder];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input0->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input1->deviceId() offset:0 atIndex:1];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:2];
[context dispatchEncoder:encoder threads:{ (NSUInteger)output->elementSize() / 4, 1, 1 } bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
}
ErrorCode MetalEltwise::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
mConst = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
NSString *kernel = nil;
switch (mType) {
switch (type) {
case EltwiseType_PROD:
kernel = @"eltwise_prod";
break;
@ -46,11 +32,32 @@ ErrorCode MetalEltwise::onExecute(const std::vector<Tensor *> &inputs, const std
default:
break;
}
mPipeline = [context pipelineWithName:kernel];
}
ErrorCode MetalEltwise::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
((int*)(mConst.contents))[0] = outputs[0]->elementSize();
auto metal = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)metal->context();
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(outputs[0]->elementSize(), 1, 1)];
return NO_ERROR;
}
void MetalEltwise::encode(const Tensor *input0, const Tensor *input1, const Tensor *output) {
auto metal = static_cast<MetalBackend *>(this->backend());
auto encoder = metal->encoder();
[encoder setComputePipelineState:mPipeline];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input0->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input1->deviceId() offset:0 atIndex:1];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:2];
[encoder setBuffer:mConst offset:0 atIndex:3];
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
}
ErrorCode MetalEltwise::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto output = outputs[0];
encode(kernel, inputs[0], inputs[1], output);
encode(inputs[0], inputs[1], output);
for (int i = 2; i < inputs.size(); i++) {
encode(kernel, inputs[i], output, output);
encode(inputs[i], output, output);
}
return NO_ERROR;
}
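A note on the new bounds check: dispatchThreadgroups launches whole threadgroups, so the grid computed by computeBestGroupAndLocal can exceed elementSize(), and the rewritten kernels reject the excess via shape.x. A minimal host-side sketch of that rounding, with UpDiv standing in for MNN's UP_DIV macro (a hypothetical name here):

#include <cstddef>

// Round a total thread count up to whole threadgroups of size `local`.
static inline std::size_t UpDiv(std::size_t a, std::size_t b) { return (a + b - 1) / b; }

// e.g. total = 1000, local = 64 -> 16 groups = 1024 launched threads;
// ids 1000..1023 must hit the kernel's `if (gid < shape.x)` guard and do nothing.
std::size_t threadgroups(std::size_t total, std::size_t local) {
    return UpDiv(total, local);
}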

View File

@ -20,11 +20,14 @@ public:
MetalInterp(Backend *backend, const Op* op);
virtual ~MetalInterp() = default;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
int32_t mReiszeType;
id<MTLBuffer> mCordTransform;
id<MTLBuffer> mShape;
id<MTLComputePipelineState> mPipeline;
std::pair<MTLSize, MTLSize> mThreads;
};
} // namespace MNN

View File

@ -28,8 +28,7 @@ MetalInterp::MetalInterp(Backend *backend, const Op* op)
mReiszeType = interpParam->resizeType();
mShape = [context newDeviceBuffer:7 * sizeof(int) access:CPUWriteOnly];
}
ErrorCode MetalInterp::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
ErrorCode MetalInterp::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto input = inputs[0], output = outputs[0];
@ -43,38 +42,44 @@ ErrorCode MetalInterp::onExecute(const std::vector<Tensor *> &inputs, const std:
((int *)mShape.contents)[4] = oh;
((int *)mShape.contents)[5] = ow * oh;
((int *)mShape.contents)[6] = slice;
// encode
auto encoder = [context encoder];
auto bandwidth = (MetalBandwidth){};
if (mReiszeType == 2 || mReiszeType == 1) {
if (2 == mReiszeType) {
bandwidth = [context load:@"resize_bilinear" encoder:encoder];
mPipeline = [context pipelineWithName:@"resize_bilinear"];
} else {
bandwidth = [context load:@"resize_nearest" encoder:encoder];
mPipeline = [context pipelineWithName:@"resize_nearest"];
}
} else if (mReiszeType == 3) {
bandwidth = [context load:@"resize_cubic" encoder:encoder];
mPipeline = [context pipelineWithName:@"resize_cubic"];
} else {
MNN_ASSERT(false);
}
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(ow, oh, slice)];
return NO_ERROR;
}
ErrorCode MetalInterp::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto input = inputs[0], output = outputs[0];
// encode
auto encoder = backend->encoder();
[encoder setComputePipelineState:mPipeline];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:mShape offset:0 atIndex:2];
[encoder setBuffer:mCordTransform offset:0 atIndex:3];
[context dispatchEncoder:encoder
threads:{ (NSUInteger) ow, (NSUInteger)oh, (NSUInteger)slice }
bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
return NO_ERROR;
}
class MetalInterpCreator : public MetalBackend::Creator {
public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend) const {
auto interpParam = op->main_as_Interp();
auto type = interpParam->resizeType();
if (type > 3) {
return nullptr;
}
return new MetalInterp(backend, op);
}
};

View File

@ -67,7 +67,7 @@ ErrorCode MetalMatMul::onExecute(const std::vector<Tensor *> &inputs, const std:
auto e = C->length(0);
auto h = C->length(1);
auto encoder = [context encoder];
auto encoder = backend->encoder();
if (inputs.size() > 2) {
auto bandwidth = [context load:@"matmul_bias" encoder:encoder];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input0->deviceId() offset:0 atIndex:0];
@ -78,7 +78,6 @@ ErrorCode MetalMatMul::onExecute(const std::vector<Tensor *> &inputs, const std:
[context dispatchEncoder:encoder
threads:{ (NSUInteger)h, (NSUInteger)e, (NSUInteger)1 }
bandwidth:bandwidth];
[encoder endEncoding];
} else {
auto bandwidth = [context load:@"matmul" encoder:encoder];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input0->deviceId() offset:0 atIndex:0];
@ -88,7 +87,6 @@ ErrorCode MetalMatMul::onExecute(const std::vector<Tensor *> &inputs, const std:
[context dispatchEncoder:encoder
threads:{ (NSUInteger)h, (NSUInteger)e, (NSUInteger)1 }
bandwidth:bandwidth];
[encoder endEncoding];
}
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;

View File

@ -13,7 +13,6 @@
extern void ___MetalReLUCreator__OpType_ReLU__();
extern void ___MetalPoolingCreator__OpType_Pooling__();
extern void ___MetalScaleCreator__OpType_Scale__();
extern void ___MetalDequantizeCreator__OpType_Dequantize__();
extern void ___MetalInterpCreator__OpType_Interp__();
extern void ___MetalUnaryCreator__OpType_UnaryOp__();
extern void ___MetalRasterCreator__OpType_Raster__();
@ -33,7 +32,6 @@ void registerMetalOps() {
___MetalReLUCreator__OpType_ReLU__();
___MetalPoolingCreator__OpType_Pooling__();
___MetalScaleCreator__OpType_Scale__();
___MetalDequantizeCreator__OpType_Dequantize__();
___MetalInterpCreator__OpType_Interp__();
___MetalUnaryCreator__OpType_UnaryOp__();
___MetalRasterCreator__OpType_Raster__();

View File

@ -20,9 +20,13 @@ public:
MetalPReLU(Backend *backend, const float *slope, int count);
virtual ~MetalPReLU() = default;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
id<MTLBuffer> mSlope;
id<MTLBuffer> mShape;
id<MTLComputePipelineState> mPipeline;
std::pair<MTLSize, MTLSize> mThreads;
bool mShareChannel = false;
};

View File

@ -18,35 +18,41 @@ MetalPReLU::MetalPReLU(Backend *backend, const float *slope, int count) : Execut
auto context = (__bridge MNNMetalContext *)static_cast<MetalBackend *>(backend)->context();
mSlope = [context newDeviceBuffer:UP_DIV(count, 4) * 4 * sizeof(float) bytes:slope access:CPUWriteOnly];
mShareChannel = 1 == count;
if (!mShareChannel) {
mShape = [context newDeviceBuffer:3 * sizeof(int) access:CPUWriteOnly];
}
mPipeline = [context pipelineWithName:mShareChannel ? @"prelu" : @"prelu_slopes"];
}
ErrorCode MetalPReLU::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto output = outputs[0];
int w = output->width(), h = output->height(), z = UP_DIV(output->channel(), 4), b = output->batch();
if (mShareChannel) {
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(w * h * z * b, 1, 1)];
} else {
((int *)mShape.contents)[0] = w * h;
((int *)mShape.contents)[1] = z;
((int *)mShape.contents)[2] = b;
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(w * h, z, b)];
}
return NO_ERROR;
}
ErrorCode MetalPReLU::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto input = inputs[0], output = outputs[0];
int w = output->width(), h = output->height(), z = UP_DIV(output->channel(), 4), b = output->batch();
auto encoder = [context encoder];
auto bandwidth = [context load:mShareChannel ? @"prelu" : @"prelu_slopes" encoder:encoder];
auto encoder = backend->encoder();
[encoder setComputePipelineState:mPipeline];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:mSlope offset:0 atIndex:2];
if (mShareChannel) {
[context dispatchEncoder:encoder
threads:{ (NSUInteger) w * h * z * b, (NSUInteger)1, (NSUInteger)1 }
bandwidth:bandwidth];
} else {
auto shape = [context newDeviceBuffer:3 * sizeof(int) access:CPUWriteOnly];
((int *)shape.contents)[0] = w * h;
((int *)shape.contents)[1] = z;
((int *)shape.contents)[2] = b;
[encoder setBuffer:shape offset:0 atIndex:3];
[context dispatchEncoder:encoder
threads:{ (NSUInteger) w * h, (NSUInteger)z, (NSUInteger)b }
bandwidth:bandwidth];
if (!mShareChannel) {
[encoder setBuffer:mShape offset:0 atIndex:3];
}
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
return NO_ERROR;
}
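Whether the shared-slope kernel ("prelu") or the per-channel one ("prelu_slopes") is used, the pointwise function is the same; only the slope lookup differs. A scalar sketch for reference:

// PReLU: identity for non-negative inputs, slope-scaled otherwise.
// With mShareChannel a single slope is used; otherwise the slope is
// indexed by the (packed) channel of the element.
inline float prelu(float x, float slope) {
    return x >= 0.0f ? x : slope * x;
}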

View File

@ -33,6 +33,9 @@ private:
int mPadX;
int mPadY;
id<MTLBuffer> mConstBuffer;
MTLSize mGroup;
MTLSize mLocal;
id<MTLComputePipelineState> mPipeline;
};
} // namespace MNN

View File

@ -59,27 +59,23 @@ ErrorCode MetalPooling::onResize(const std::vector<Tensor *> &inputs, const std:
((int *)mConstBuffer.contents)[8] = strideHeight;
((int *)mConstBuffer.contents)[9] = padWidth;
((int *)mConstBuffer.contents)[10] = padHeight;
auto ow = output->width(), oh = output->height(), slice = UP_DIV(output->channel(), 4) * output->batch();
mPipeline = [context pipelineWithName:(mPoolType == PoolType_MAXPOOL) ? @"pooling_max" : @"pooling_avg"];
auto size = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(ow, oh, slice)];
mLocal = size.second;
mGroup = size.first;
return NO_ERROR;
}
ErrorCode MetalPooling::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto input = inputs[0], output = outputs[0];
auto ow = output->width(), oh = output->height(), slice = UP_DIV(output->channel(), 4) * output->batch();
auto encoder = [context encoder];
auto bandwidth = [context load:(mPoolType == PoolType_MAXPOOL) ? @"pooling_max" : @"pooling_avg" encoder:encoder];
bandwidth.zAxisProtected = YES;
auto encoder = backend->encoder();
[encoder setComputePipelineState:mPipeline];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:mConstBuffer offset:0 atIndex:2];
[context dispatchEncoder:encoder
threads:{ (NSUInteger) ow, (NSUInteger)oh, (NSUInteger)slice }
bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
[encoder dispatchThreadgroups:mGroup threadsPerThreadgroup:mLocal];
return NO_ERROR;
}

View File

@ -36,7 +36,7 @@ ErrorCode MetalROIPooling::onExecute(const std::vector<Tensor *> &inputs, const
((int *)shape.contents)[6] = oz;
((float *)shape.contents)[7] = mSpatialScale;
auto encoder = [context encoder];
auto encoder = backend->encoder();
auto bandwidth = [context load:@"ROI_pooling" encoder:encoder];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)roi->deviceId() offset:0 atIndex:1];
@ -45,7 +45,6 @@ ErrorCode MetalROIPooling::onExecute(const std::vector<Tensor *> &inputs, const
[context dispatchEncoder:encoder
threads:{ (NSUInteger) ow, (NSUInteger)oh, (NSUInteger)oz *ob }
bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;
}

View File

@ -24,10 +24,13 @@ public:
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
std::map<Tensor*, std::shared_ptr<Tensor>> mTempInput;
std::vector<std::tuple<id<MTLBuffer>, id<MTLBuffer>, std::vector<int> > > mTempInputCopy;
std::vector<std::tuple<id<MTLBuffer>, id<MTLBuffer>, MTLSize, MTLSize> > mTempInputCopy;
std::shared_ptr<Tensor> mTempOutput;
bool mNeedZero = false;
id<MTLBuffer> mOutputPtr;
bool mFast = false;
id<MTLComputePipelineState> mBlitPipeline;
std::vector<id<MTLBuffer>> mShapeTemp;
};
} // namespace MNN

View File

@ -6,12 +6,14 @@
// Copyright © 2018, Alibaba Group Holding Limited
//
#if MNN_METAL_ENABLED
#import "backend/metal/MetalRaster.hpp"
#import "backend/metal/MNNMetalContext.h"
#import "core/Macro.h"
#import "backend/metal/MetalBackend.hpp"
#include "core/TensorUtils.hpp"
#if MNN_METAL_ENABLED
#include "core/OpCommonUtils.hpp"
namespace MNN {
struct SamplerInfo {
@ -44,10 +46,67 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &inputs, const std::
auto des = TensorUtils::getDescribe(input);
auto outputDes = TensorUtils::getDescribe(output);
mNeedZero = !TensorUtils::regionIsFull(input);
auto context = (__bridge MNNMetalContext *)static_cast<MetalBackend *>(backend())->context();
auto bytes = outputs[0]->getType().bytes();
mTempInput.clear();
mTempOutput = nullptr;
mOutputPtr = (__bridge id<MTLBuffer>)((void*)output->deviceId());
#ifndef MNN_METAL_FORBID_RASTER_C4
if (outputDes->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
bool fast = true;
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
if (TensorUtils::getDescribe(slice.origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
fast = false;
break;
}
if (!OpCommonUtils::canBlitFast(slice, output)) {
fast = false;
break;
}
}
mFast = fast;
if (fast) {
NSString* kernelName = nil;
switch (bytes) {
case 4:
kernelName = @"blit_int32x4";
break;
case 2:
kernelName = @"blit_int64";
break;
case 1:
kernelName = @"blit_int32";
break;
default:
break;
}
if (outputs[0]->getType().code == halide_type_float) {
#if MNN_METAL_FULL_PRECISION
kernelName = @"blit_int32x4";
#else
kernelName = @"blit_int64";
#endif
}
mBlitPipeline = [context pipelineWithName:kernelName];
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
Tensor::InsideDescribe::Region newRegion;
OpCommonUtils::turnToPackRegion(slice, newRegion, output, 4);
newRegion.dst.offset /= 4;
newRegion.src.offset /= 4;
SamplerInfo info;
writeSamplerInfo(info, newRegion);
auto local = [context computeBestGroupAndLocal:mBlitPipeline threads:MTLSizeMake(newRegion.size[0], newRegion.size[1], newRegion.size[2])];
auto buffer = [context newDeviceBuffer:sizeof(SamplerInfo) bytes:&info access:CPUWriteOnly];
mTempInputCopy.emplace_back(std::make_tuple((__bridge id<MTLBuffer>)(void*)newRegion.origin->deviceId(), buffer, local.first, local.second));
}
return NO_ERROR;
}
}
#endif
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
auto origin = slice.origin;
@ -87,47 +146,6 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &inputs, const std::
if (nullptr != mTempOutput) {
backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
}
auto context = (__bridge MNNMetalContext *)static_cast<MetalBackend *>(backend())->context();
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
if (nullptr == slice.origin) {
continue;
}
SamplerInfo info;
writeSamplerInfo(info, slice);
auto buffer = [context newDeviceBuffer:sizeof(SamplerInfo) bytes:&info access:CPUWriteOnly];
auto iter = mTempInput.find(slice.origin);
std::vector<int> regionSize = {
slice.size[0], slice.size[1], slice.size[2]
};
if (iter != mTempInput.end()) {
mTempInputCopy.emplace_back(std::make_tuple((__bridge id<MTLBuffer>)(void*)iter->second->deviceId(), buffer, regionSize));
continue;
}
mTempInputCopy.emplace_back(std::make_tuple((__bridge id<MTLBuffer>)(void*)slice.origin->deviceId(), buffer, regionSize));
}
return NO_ERROR;
}
ErrorCode MetalRaster::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
if (mNeedZero) {
auto size = outputs[0]->elementSize();
if (mTempOutput != nullptr) {
size = mTempOutput->elementSize();
}
size = ((size + 3) / 4) * 4 * sizeof(metal_float);
auto blitEncode = [context encoderBlit];
[blitEncode fillBuffer:mOutputPtr range:NSMakeRange(0, size) value:0];
[blitEncode endEncoding];
}
for (auto& iter : mTempInput) {
backend->onCopyBuffer(iter.first, iter.second.get());
}
auto encoder = [context encoder];
auto bytes = outputs[0]->getType().bytes();
NSString* kernelName = nil;
switch (bytes) {
case 4:
@ -143,21 +161,69 @@ ErrorCode MetalRaster::onExecute(const std::vector<Tensor *> &inputs, const std:
break;
}
if (outputs[0]->getType().code == halide_type_float) {
kernelName = @"blit_float";
#if MNN_METAL_FULL_PRECISION
kernelName = @"blit_int32";
#else
kernelName = @"blit_int16";
#endif
}
auto bandwidth = [context load:kernelName encoder:encoder];
mBlitPipeline = [context pipelineWithName:kernelName];
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
if (nullptr == slice.origin) {
continue;
}
SamplerInfo info;
writeSamplerInfo(info, slice);
auto buffer = [context newDeviceBuffer:sizeof(SamplerInfo) bytes:&info access:CPUWriteOnly];
auto iter = mTempInput.find(slice.origin);
auto local = [context computeBestGroupAndLocal:mBlitPipeline threads:MTLSizeMake(slice.size[0], slice.size[1], slice.size[2])];
if (iter != mTempInput.end()) {
mTempInputCopy.emplace_back(std::make_tuple((__bridge id<MTLBuffer>)(void*)iter->second->deviceId(), buffer, local.first, local.second));
continue;
}
mTempInputCopy.emplace_back(std::make_tuple((__bridge id<MTLBuffer>)(void*)slice.origin->deviceId(), buffer, local.first, local.second));
}
mShapeTemp.clear();
for (auto& iter : mTempInput) {
id<MTLBuffer> shape = [context newDeviceBuffer:4*sizeof(int) access:CPUWriteOnly];
mShapeTemp.emplace_back(std::move(shape));
}
if (nullptr != mTempOutput) {
mShapeTemp.emplace_back([context newDeviceBuffer:4*sizeof(int) access:CPUWriteOnly]);
}
return NO_ERROR;
}
ErrorCode MetalRaster::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
if (mNeedZero) {
backend->flushEncoder();
auto size = outputs[0]->elementSize();
if (mTempOutput != nullptr) {
size = mTempOutput->elementSize();
}
size = ((size + 3) / 4) * 4 * sizeof(metal_float);
auto blitEncode = [context encoderBlit];
[blitEncode fillBuffer:mOutputPtr range:NSMakeRange(0, size) value:0];
[blitEncode endEncoding];
}
auto encoder = backend->encoder();
int index = 0;
for (auto& iter : mTempInput) {
backend->onCopyBuffer(iter.first, iter.second.get(), encoder, mShapeTemp[index++]);
}
[encoder setComputePipelineState:mBlitPipeline];
for (auto& iter : mTempInputCopy) {
[encoder setBuffer: std::get<0>(iter) offset:0 atIndex: 0];
[encoder setBuffer: mOutputPtr offset:0 atIndex: 1];
[encoder setBuffer: std::get<1>(iter) offset:0 atIndex: 2];
auto& size = std::get<2>(iter);
[context dispatchEncoder:encoder
threads:{ (NSUInteger)size[0], (NSUInteger)size[1], (NSUInteger)size[2]}
bandwidth:bandwidth];
[encoder dispatchThreadgroups:std::get<2>(iter) threadsPerThreadgroup:std::get<3>(iter)];
}
[encoder endEncoding];
if (nullptr != mTempOutput) {
backend->onCopyBuffer(mTempOutput.get(), outputs[0]);
backend->onCopyBuffer(mTempOutput.get(), outputs[0], encoder, mShapeTemp[index]);
}
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;

View File

@ -31,13 +31,12 @@ ErrorCode MetalReLU::onExecute(const std::vector<Tensor *> &inputs, const std::v
}
MNN_ASSERT(mSlope.length == sizeof(float));
auto encoder = [context encoder];
auto encoder = backend->encoder();
auto bandwidth = [context load:simd ? @"relu_x4" : @"relu_x1" encoder:encoder];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:mSlope offset:0 atIndex:2];
[context dispatchEncoder:encoder threads:{ size, 1, 1 } bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;
}

View File

@ -29,12 +29,11 @@ ErrorCode MetalReLU6::onExecute(const std::vector<Tensor *> &inputs, const std::
size /= 4;
}
auto encoder = [context encoder];
auto encoder = backend->encoder();
auto bandwidth = [context load:simd ? @"relu6_x4" : @"relu6_x1" encoder:encoder];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[context dispatchEncoder:encoder threads:{ size, 1, 1 } bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;
}

View File

@ -18,15 +18,16 @@ namespace MNN {
class MetalReduction : public Execution {
public:
MetalReduction(Backend *backend, const ReductionParam *reduction);
MetalReduction(Backend *backend, const ReductionParam *reduction, halide_type_t type);
virtual ~MetalReduction() = default;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
NSString *mKernel;
std::vector<std::shared_ptr<Tensor>> mMiddles;
std::vector<int> mDims;
int mAxis;
id<MTLComputePipelineState> mPipeline;
std::pair<MTLSize, MTLSize> mThreads;
id<MTLBuffer> mConst;
};
} // namespace MNN

View File

@ -16,106 +16,69 @@
#if MNN_METAL_ENABLED
namespace MNN {
MetalReduction::MetalReduction(Backend *backend, const ReductionParam *p) : Execution(backend) {
auto integer = p->dType() == DataType_DT_INT32;
MetalReduction::MetalReduction(Backend *backend, const ReductionParam *p, halide_type_t type) : Execution(backend) {
auto integer = type.code == halide_type_int;
NSString *kernel;
switch (p->operation()) {
case ReductionType_SUM:
mKernel = integer ? @"reduce_sum_s" : @"reduce_sum_f";
kernel = integer ? @"reduce_sum_s" : @"reduce_sum_f";
break;
case ReductionType_ASUM:
case ReductionType_SUMSQ:
MNN_ASSERT(false); // both un-supported
break;
case ReductionType_MEAN:
mKernel = integer ? @"reduce_mean_s" : @"reduce_mean_f";
kernel = integer ? @"reduce_mean_s" : @"reduce_mean_f";
break;
case ReductionType_MAXIMUM:
mKernel = integer ? @"reduce_max_s" : @"reduce_max_f";
kernel = integer ? @"reduce_max_s" : @"reduce_max_f";
break;
case ReductionType_MINIMUM:
mKernel = integer ? @"reduce_min_s" : @"reduce_min_f";
kernel = integer ? @"reduce_min_s" : @"reduce_min_f";
break;
case ReductionType_PROD:
mKernel = integer ? @"reduce_prod_s" : @"reduce_prod_f";
kernel = integer ? @"reduce_prod_s" : @"reduce_prod_f";
break;
default:
break;
}
for (int i = 0; i < p->dim()->size(); i++) {
mDims.push_back(p->dim()->data()[i]);
}
// The reduction left after geometry compute has only one axis
mAxis = p->dim()->data()[0];
auto mkbn = static_cast<MetalBackend *>(backend);
auto context = (__bridge MNNMetalContext *)mkbn->context();
mConst = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
mPipeline = [context pipelineWithName:kernel];
}
ErrorCode MetalReduction::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
if (mDims.size() <= 1)
return NO_ERROR;
auto backend = static_cast<MetalBackend *>(this->backend());
auto &input = inputs[0];
mMiddles.clear();
for (int i = 0; i < mDims.size() - 1; i++) {
auto middle = new Tensor(input->buffer().dimensions);
TensorUtils::copyShape(input, middle);
for (int j = 0; j <= i; j++) {
middle->buffer().dim[mDims[j]].extent = 1;
}
backend->onAcquireBuffer(middle, Backend::DYNAMIC);
mMiddles.push_back(std::shared_ptr<Tensor>(middle));
}
for (auto &t : mMiddles) {
backend->onReleaseBuffer(t.get(), Backend::DYNAMIC);
}
return NO_ERROR;
}
static void encode(MNNMetalContext *context, id<MTLComputeCommandEncoder> encoder, MetalBandwidth bandwidth,
const Tensor *input, const Tensor *output, int axis) {
auto ib = input->buffer();
int outsideSize = 1, axisSize = 1, insideSize = 1;
if (axis >= 0) {
for (int i = 0; i < axis; i++)
outsideSize *= ib.dim[i].extent;
axisSize = ib.dim[axis].extent;
for (int i = axis + 1; i < ib.dimensions; i++)
insideSize *= ib.dim[i].extent;
} else {
axisSize = input->elementSize();
for (int i = 0; i < mAxis; i++) {
outsideSize *= inputs[0]->length(i);
}
auto shape = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
((int *)shape.contents)[0] = outsideSize;
((int *)shape.contents)[1] = axisSize;
((int *)shape.contents)[2] = insideSize;
((int *)shape.contents)[3] = axisSize * insideSize;
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:shape offset:0 atIndex:2];
[context dispatchEncoder:encoder
threads:{ (NSUInteger) outsideSize, (NSUInteger)insideSize, 1 }
bandwidth:bandwidth];
axisSize = inputs[0]->length(mAxis);
for (int i = mAxis + 1; i < inputs[0]->dimensions(); i++) {
insideSize *= inputs[0]->length(i);
}
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
((int *)mConst.contents)[0] = outsideSize;
((int *)mConst.contents)[1] = axisSize;
((int *)mConst.contents)[2] = insideSize;
((int *)mConst.contents)[3] = axisSize * insideSize;
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(outsideSize, insideSize, 1)];
return NO_ERROR;
}
ErrorCode MetalReduction::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto &input = inputs[0], &output = outputs[0];
auto encoder = [context encoder];
auto bandwidth = [context load:mKernel encoder:encoder];
if (mDims.empty()) {
encode(context, encoder, bandwidth, input, output, -1);
} else if (mDims.size() == 1) {
encode(context, encoder, bandwidth, input, output, mDims[0]);
} else {
encode(context, encoder, bandwidth, input, mMiddles[0].get(), mDims[0]);
for (int i = 1; i < mMiddles.size(); i++) {
encode(context, encoder, bandwidth, mMiddles[i - 1].get(), mMiddles[i].get(), mDims[i]);
}
encode(context, encoder, bandwidth, mMiddles.back().get(), output, mDims.back());
}
[encoder endEncoding];
auto encoder = backend->encoder();
[encoder setComputePipelineState:mPipeline];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:mConst offset:0 atIndex:2];
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;
}
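For reference, the outside/axis/inside sizes written into mConst follow the usual single-axis factoring of the element index. A small standalone sketch, using a plain shape vector instead of MNN's Tensor API:

#include <vector>

// For a reduction over `axis`, each element index splits into (outside, a, inside),
// where `a` runs over the reduced axis.
struct ReduceShape { int outside; int axisSize; int inside; };

ReduceShape factorize(const std::vector<int>& dims, int axis) {
    ReduceShape s{1, dims[axis], 1};
    for (int i = 0; i < axis; ++i) s.outside *= dims[i];
    for (int i = axis + 1; i < static_cast<int>(dims.size()); ++i) s.inside *= dims[i];
    return s;  // e.g. dims = {2, 3, 4, 5}, axis = 2 -> {6, 4, 5}
}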
@ -123,7 +86,18 @@ ErrorCode MetalReduction::onExecute(const std::vector<Tensor *> &inputs, const s
class MetalReductionCreator : public MetalBackend::Creator {
public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend) const {
return new MetalReduction(backend, op->main_as_ReductionParam());
auto param = op->main_as_ReductionParam();
switch (param->operation()) {
case ReductionType_ALL:
case ReductionType_ANY:
case ReductionType_ASUM:
case ReductionType_SUMSQ:
return nullptr;
default:
break;
};
return new MetalReduction(backend, op->main_as_ReductionParam(), inputs[0]->getType());
}
};
REGISTER_METAL_OP_CREATOR(MetalReductionCreator, OpType_Reduction);

View File

@ -20,11 +20,15 @@ class MetalScale : public Execution {
public:
MetalScale(Backend *backend, const Scale *scale);
virtual ~MetalScale() = default;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
id<MTLBuffer> mScale;
id<MTLBuffer> mBias;
id<MTLBuffer> mConst;
id<MTLComputePipelineState> mPipeline;
std::pair<MTLSize, MTLSize> mThreads;
};
} // namespace MNN

View File

@ -17,18 +17,6 @@ struct scale_shape {
int batch;
};
kernel void scale_tf(const device ftype *in [[buffer(0)]],
device ftype *out [[buffer(1)]],
constant scale_shape &s [[buffer(2)]],
const device float *scales [[buffer(3)]],
const device float *biasTerms [[buffer(4)]],
uint2 gid [[thread_position_in_grid]]) {
if ((int)gid.x >= s.steps || (int)gid.y >= s.size * s.batch) return;
out[int(gid.y) * s.steps + int(gid.x)] =
in [int(gid.y) * s.steps + int(gid.x)] * ftype(scales[int(gid.x)]) + ftype(biasTerms[int(gid.x)]);
}
kernel void scale_ca(const device ftype4 *in [[buffer(0)]],
device ftype4 *out [[buffer(1)]],
constant scale_shape &s [[buffer(2)]],

View File

@ -21,50 +21,39 @@ MetalScale::MetalScale(Backend *backend, const Scale *scale) : Execution(backend
mBias = scale->biasData()
? [context newDeviceBuffer:channel4 * sizeof(float) bytes:scale->biasData()->data() access:CPUWriteOnly]
: [context newDeviceBuffer:channel4 * sizeof(float) access:CPUTransparent];
mConst = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
mPipeline = [context pipelineWithName:@"scale_ca"];
}
ErrorCode MetalScale::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
ErrorCode MetalScale::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto input = inputs[0], output = outputs[0];
auto output = outputs[0];
// shape
auto tf = input->getDimensionType() == Tensor::TENSORFLOW;
int w = output->width();
int h = output->height();
int c = output->channel();
int z = UP_DIV(c, 4);
((int *)mConst.contents)[0] = w*h;
((int *)mConst.contents)[1] = z;
((int *)mConst.contents)[2] = output->batch();
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(w*h, z * outputs[0]->batch(), 1)];
return NO_ERROR;
}
auto shape = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
((int *)shape.contents)[0] = w * h;
((int *)shape.contents)[2] = output->batch();
ErrorCode MetalScale::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto input = inputs[0], output = outputs[0];
auto encoder = [context encoder];
auto encoder = backend->encoder();
[encoder setComputePipelineState:mPipeline];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:shape offset:0 atIndex:2];
[encoder setBuffer:mConst offset:0 atIndex:2];
[encoder setBuffer:mScale offset:0 atIndex:3];
[encoder setBuffer:mBias offset:0 atIndex:4];
// tensorflow
if (tf) {
((int *)shape.contents)[1] = c;
auto bandwidth = [context load:@"scale_tf" encoder:encoder];
[context dispatchEncoder:encoder
threads:{ (NSUInteger) c, (NSUInteger)w * h * output->batch(), 1 }
bandwidth:bandwidth];
}
// caffe
else {
((int *)shape.contents)[1] = z;
auto bandwidth = [context load:@"scale_ca" encoder:encoder];
[context dispatchEncoder:encoder
threads:{ (NSUInteger)w * h, (NSUInteger)z * output->batch(), 1 }
bandwidth:bandwidth];
}
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
return NO_ERROR;
}

View File

@ -21,9 +21,13 @@ public:
MetalUnary(Backend *backend, UnaryOpOperation optype);
virtual ~MetalUnary() = default;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
UnaryOpOperation mOpType;
id<MTLBuffer> mConstBuffer;
id<MTLComputePipelineState> mPipeline;
std::pair<MTLSize, MTLSize> mThreads;
};
} // namespace MNN

View File

@ -17,12 +17,12 @@ struct unary_shape {
int size;
};
static inline ftype4 neg(ftype4 value) { return -value; }
static inline ftype4 square(ftype4 value) { return value * value; }
static inline ftype4 expm1(ftype4 value) {return exp(value) - 1;}
static inline ftype4 reciprocal(ftype4 value) {return 1.0/(value);}
static inline ftype4 sigmoid(ftype4 value) {return 1.f / (1.f + exp(-value));}
static inline ftype4 log1p(ftype4 value) {return log(1.f + value);}
static inline float4 neg(float4 value) { return -value; }
static inline float4 square(float4 value) { return value * value; }
static inline float4 expm1(float4 value) {return exp(value) - 1;}
static inline float4 reciprocal(float4 value) {return 1.0/(value);}
static inline float4 sigmoid(float4 value) {return 1.f / (1.f + exp(-value));}
static inline float4 log1p(float4 value) {return log(1.f + value);}
#define define_op(op) \
kernel void unary_##op##_x4(const device ftype4 *in [[buffer(0)]], \
@ -31,7 +31,7 @@ kernel void unary_##op##_x4(const device ftype4 *in [[buffer(0)]], \
uint3 gid [[thread_position_in_grid]]) { \
if (gid.x < (uint)s.width) { \
int off = gid.z * s.size + gid.y * s.width + gid.x; \
out[off] = op(in[off]); \
out[off] = (ftype4)(op((float4)(in[off]))); \
} \
} \
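The helpers now compute in float4 and the macro casts back to ftype4, presumably so the transcendental math keeps full precision when ftype is half. The scalar version of the issue, in standard C++:

#include <cmath>

// exp(x) - 1 for small x: evaluated in float the result keeps its leading digits,
// while a half-precision exp(1e-4f) already rounds to 1.0 and the difference is 0.
inline float expm1ViaFloat(float x) {
    return std::exp(x) - 1.0f;  // mirrors the float4 path used by the kernel
}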

View File

@ -14,10 +14,6 @@
#if MNN_METAL_ENABLED
namespace MNN {
MetalUnary::MetalUnary(Backend *backend, UnaryOpOperation optype) : Execution(backend), mOpType(optype) {
// nothing to do
}
static NSString *kernelForType(UnaryOpOperation type) {
#define op_case(type, imp) \
case UnaryOpOperation_##type: \
@ -56,26 +52,34 @@ static NSString *kernelForType(UnaryOpOperation type) {
}
}
MetalUnary::MetalUnary(Backend *backend, UnaryOpOperation optype) : Execution(backend), mOpType(optype) {
auto mtbn = static_cast<MetalBackend *>(backend);
auto context = (__bridge MNNMetalContext *)mtbn->context();
mConstBuffer = [context newDeviceBuffer:3 * sizeof(int) access:CPUWriteOnly];
mPipeline = [context pipelineWithName:kernelForType(mOpType)];
}
ErrorCode MetalUnary::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto mtbn = static_cast<MetalBackend *>(backend());
auto context = (__bridge MNNMetalContext *)mtbn->context();
auto input = inputs[0];
auto element = input->elementSize();
auto sizeDiv4 = UP_DIV(element, 4);
((int *)mConstBuffer.contents)[0] = sizeDiv4;
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(sizeDiv4, 1, 1)];
return NO_ERROR;
}
ErrorCode MetalUnary::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
// prepare
auto input = inputs[0], output = outputs[0];
auto element = input->elementSize();
auto sizeDiv4 = UP_DIV(element, 4);
// create shape
auto shape = [context newDeviceBuffer:3 * sizeof(int) access:CPUWriteOnly];
((int *)shape.contents)[0] = sizeDiv4;
auto encoder = [context encoder];
auto bandwidth = [context load:kernelForType(mOpType) encoder:encoder];
auto encoder = backend->encoder();
[encoder setComputePipelineState:mPipeline];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:shape offset:0 atIndex:2];
[context dispatchEncoder:encoder threads:{ (NSUInteger) sizeDiv4, (NSUInteger)1, (NSUInteger)1 } bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
[encoder setBuffer:mConstBuffer offset:0 atIndex:2];
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
return NO_ERROR;
}

View File

@ -36,7 +36,9 @@ CLRuntime::CLRuntime(const Backend::Info& info){
} else {
mOpenCLRuntime.reset(new OpenCLRuntime(false));
}
if(mOpenCLRuntime.get()){
mCLRuntimeError = mOpenCLRuntime->isCreateError();
if(!mCLRuntimeError){
mImagePool.reset(new ImagePool(mOpenCLRuntime->context()));
mStaticImagePool.reset(new ImagePool(mOpenCLRuntime->context()));
mBufferPool.reset(new BufferPool(mOpenCLRuntime->context(), CL_MEM_READ_WRITE));
@ -69,6 +71,10 @@ void CLRuntime::onGabageCollect(int level) {
//nothing now
}
bool CLRuntime::isCLRuntimeError() {
return mCLRuntimeError;
}
std::map<OpType, OpenCLBackend::Creator*>* gCreator() {
static std::once_flag once;
static std::map<OpType, OpenCLBackend::Creator*>* creators = nullptr;
@ -574,7 +580,7 @@ void OpenCLBackend::copyToDevice(const Tensor* srcTensor, const Tensor* dstTenso
mOpenCLRuntime->commandQueue().enqueueWriteBuffer(*mHostBuffer.second, CL_TRUE, 0, srcTensor->elementSize()*sizeof(float), hostPtr);
}
#else
mOpenCLRuntime->commandQueue().enqueueWriteBuffer(*mHostBuffer.second, CL_FALSE, 0, srcTensor->elementSize()*sizeof(float), hostPtr);
mOpenCLRuntime->commandQueue().enqueueWriteBuffer(*mHostBuffer.second, CL_TRUE, 0, srcTensor->elementSize()*sizeof(float), hostPtr);
#endif
// Host -> OpenCL
MNN_DATA_FORMAT data_format = TensorUtils::getDescribe(srcTensor)->dimensionFormat;
@ -658,7 +664,12 @@ class CLRuntimeCreator : public RuntimeCreator {
return nullptr;
}
#endif
return new CLRuntime(info);
auto rt = new CLRuntime(info);
if(rt->isCLRuntimeError() == true) {
delete rt;
return nullptr;
}
return rt;
}
virtual bool onValid(Backend::Info& info) const {
return true;

View File

@ -77,7 +77,8 @@ public:
virtual void onGabageCollect(int level) override;
virtual std::pair<const void*, size_t> onGetCache() override;
virtual bool onSetCache(const void* buffer, size_t size) override;
bool isCLRuntimeError();
private:
Backend::Info mInfo;
std::shared_ptr<ImagePool> mImagePool;
@ -87,6 +88,7 @@ private:
std::shared_ptr<OpenCLRuntime> mOpenCLRuntime;
BackendConfig::PrecisionMode mPrecision;
bool mCLRuntimeError = false;
friend class OpenCLBackend;

View File

@ -45,12 +45,12 @@ OpenCLRuntime::OpenCLRuntime(bool permitFloat16) {
std::vector<cl::Platform> platforms;
cl_int res = cl::Platform::get(&platforms);
MNN_CHECK_CL_SUCCESS(res);
if(platforms.size() > 0){
if(platforms.size() > 0 && res == CL_SUCCESS){
cl::Platform::setDefault(platforms[0]);
std::vector<cl::Device> gpuDevices;
platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &gpuDevices);
res = platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &gpuDevices);
if(1 <= gpuDevices.size()){
if(1 <= gpuDevices.size() && res == CL_SUCCESS){
mFirstGPUDevicePtr = std::make_shared<cl::Device>(gpuDevices[0]);
const std::string deviceName = mFirstGPUDevicePtr->getInfo<CL_DEVICE_NAME>();
const std::string deviceVersion = mFirstGPUDevicePtr->getInfo<CL_DEVICE_VERSION>();

View File

@ -248,7 +248,7 @@ static int8_t *ReadQuanData_c(unsigned char *&s, uint32_t *len) {
return blob;
}
static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, uint32_t *len) {
static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, uint32_t *len, const flatbuffers::Vector<float> *alpha) {
// MNN_ERROR("sparse:%d\n", 1);
unsigned short shape[64] = {0};
uint32_t ucMapSize = 0;
@ -332,7 +332,21 @@ static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, uint32_t *len) {
}
// set blob data with idx and weight idx
{
memset(blob, 0, Size * sizeof(signed char));
if (alpha->size() == 2 * shape[0]) {
auto alphaPtr = alpha->data();
int area = Size / shape[0];
for (int i = 0; i < shape[0]; i++) {
float min = alphaPtr[2*i];
float scale = alphaPtr[2*i+1];
int zeroQuant = -128;
if (scale > 1e-6) {
zeroQuant = round((0.0f - min) / scale) + (-128);
}
memset(blob+area*i, zeroQuant, area * sizeof(signed char));
}
} else {
memset(blob, 0, Size * sizeof(signed char)); // backward compatibility with previous symmetric weight quant
}
int iPreIdx = 0;
for (int i = 0; i < nnz; i++) {
iPreIdx += arrIdx[i];
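With the asymmetric format, the "missing" entries of a sparse channel are filled with that channel's quantized zero point rather than 0. A standalone sketch of the zero-point computation, assuming alpha stores (min, scale) pairs per output channel as above:

#include <cmath>
#include <cstdint>

// Quantized values start at -128; a degenerate scale falls back to -128.
inline int8_t zeroPoint(float min, float scale) {
    if (scale <= 1e-6f) return -128;
    return static_cast<int8_t>(std::lround((0.0f - min) / scale) + (-128));
}
// e.g. min = -0.5f, scale = 0.01f -> zeroPoint = -78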
@ -358,7 +372,7 @@ std::shared_ptr<ConvolutionCommon::Int8Common> ConvolutionCommon::load(const IDS
buffer = ReadQuanData_c(originBuffer, &weightLength);
}
if (2 == quan->type()) {
buffer = ReadSparseQuanData_c(originBuffer, &weightLength);
buffer = ReadSparseQuanData_c(originBuffer, &weightLength, quan->alpha());
}
// read fp16 data
if (3 == quan->type()) {
@ -424,14 +438,27 @@ std::shared_ptr<ConvolutionCommon::Int8Common> ConvolutionCommon::load(const IDS
MNN_PRINT("Alloc memory error for extract idst int8/ Back to float\n");
return nullptr;
}
auto outputCount = result->alpha.size();
int outputCount = 0;
if (quan->readType() != 0) {
outputCount = result->alpha.size() / 2;
} else {
outputCount = result->alpha.size(); // backward compatibility with previous symmetric quantization
}
int partWeightSize = weightLength / outputCount;
for (int o = 0; o < outputCount; ++o) {
auto dstW = result->weightFloat.get() + o * partWeightSize;
auto srcW = result->weight.get() + o * partWeightSize;
float alpha = result->alpha.get()[o];
for (int j = 0; j < partWeightSize; ++j) {
dstW[j] = ((float)srcW[j]) * alpha * quan->quantScale();
if (result->alpha.size() == 2 * outputCount) {
float min = result->alpha.get()[2*o];
float alpha = result->alpha.get()[2*o+1];
for (int j = 0; j < partWeightSize; ++j) {
dstW[j] = (( (float)srcW[j] - (-128) ) * alpha + min) * quan->quantScale();
}
} else {
float alpha = result->alpha.get()[o];
for (int j = 0; j < partWeightSize; ++j) {
dstW[j] = ((float)srcW[j]) * alpha * quan->quantScale();
}
}
}
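The per-channel reconstruction above is just the inverse of that asymmetric quantization. A tiny sketch of one weight's dequantization, under the assumption that alpha holds (min, scale) pairs in the new format:

#include <cstdint>

// q was produced as round((w - min) / scale) + (-128); invert and apply quantScale.
inline float dequantAsymmetric(int8_t q, float min, float scale, float quantScale) {
    return ((static_cast<float>(q) - (-128.0f)) * scale + min) * quantScale;
}
// e.g. q = -78, min = -0.5f, scale = 0.01f, quantScale = 1.0f -> ~0.0f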

View File

@ -32,16 +32,16 @@ Session::Session(Schedule::ScheduleInfo&& info, Interpreter::SessionMode callBac
defaultInfo.numThread = 1;
mTensors = std::move(info.allTensors);
for (auto& iter : info.pipelineInfo) {
auto runtime = mRuntime.first.find(iter.first.type)->second.get();
auto rt = mRuntime.first.find(iter.first.type)->second.get();
auto cpuRuntime = mRuntime.second;
std::shared_ptr<Backend> first(runtime->onCreate());
std::shared_ptr<Backend> first(rt->onCreate());
std::shared_ptr<Backend> second;
if (first->type() == MNN_FORWARD_CPU) {
second = first;
} else {
second.reset(cpuRuntime->onCreate());
}
std::shared_ptr<Pipeline> newPipeline(new Pipeline(std::move(iter.second), first, second, inputMode == Interpreter::Session_Input_Inside, runtime->onGetCompilerType() == Runtime::Compiler_Geometry));
std::shared_ptr<Pipeline> newPipeline(new Pipeline(std::move(iter.second), first, second, inputMode == Interpreter::Session_Input_Inside, rt->onGetCompilerType() == Runtime::Compiler_Geometry));
mPipelines.emplace_back(std::move(newPipeline));
}
mInputs = std::move(info.inputTensors);

View File

@ -469,7 +469,7 @@ bool TensorUtils::fuseRegion(Tensor::InsideDescribe::Region& srcReg, Tensor::Ins
newSrc[index] = srcSrc[i];
}
if (dstSize.size() > sizeNum) {
for (int i = 3; i >= 0; i--) {
for (int i = 2; i >= 0; i--) { // dstReg.size has only three entries (size[0..2])
dstReg.size[i] = i < dstSize.size() ? dstSize[i] : 1;
}
}

View File

@ -36,8 +36,8 @@ static void _ConverterInterp(const Interp* resize, InterpT* dstInfo, int inW, in
switch (resize->ctm()) {
case CoordinateTransformationMode_NotSet:
{
// For compatibility
if (resize->halfPixelCenters()) {
// For compatibility: old models' nearest resize does not support half-pixel centers
if (resize->halfPixelCenters() && resize->resizeType() != 1) {
dstInfo->heightScale = (float)(inH) / (float)(outH);
dstInfo->widthScale = (float)(inW) / (float)(outW);
dstInfo->widthOffset = 0.5f * dstInfo->widthScale - 0.5f;
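The scale/offset pair set here encodes the half-pixel-centers mapping from destination to source coordinates. A short sketch for the width axis (the height axis is analogous):

// offset = 0.5 * scale - 0.5, so srcX = x * scale + offset = (x + 0.5) * scale - 0.5,
// i.e. pixel centers are aligned rather than corners.
inline float halfPixelSrcCoord(int x, int inW, int outW) {
    const float scale = static_cast<float>(inW) / static_cast<float>(outW);
    return (static_cast<float>(x) + 0.5f) * scale - 0.5f;
}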
@ -194,8 +194,17 @@ public:
static void _create() {
std::shared_ptr<GeometryComputer> comp(new GeometryImageOp);
GeometryComputer::registerGeometryComputer(
comp, {OpType_ConvInt8, OpType_ConvolutionDepthwise, OpType_DeconvolutionDepthwise,
OpType_Pooling, OpType_Interp, OpType_Resize, OpType_Int8ToFloat, OpType_FloatToInt8});
comp, {
OpType_ConvInt8,
OpType_DepthwiseConvInt8,
OpType_ConvolutionDepthwise,
OpType_DeconvolutionDepthwise,
OpType_Pooling,
OpType_Interp,
OpType_Resize,
OpType_Int8ToFloat,
OpType_FloatToInt8
});
}
REGISTER_GEOMETRY(GeometryImageOp, _create);

View File

@ -27,6 +27,9 @@ class ConcatSizeComputer : public SizeComputer {
// Concat inputs may contain scalars, which should be removed
for (const auto& input : inputs) {
auto inputDimensions = input->buffer().dimensions;
if (input->size() <= 0) {
continue;
}
::memcpy(ob.dim, input->buffer().dim, sizeof(halide_dimension_t) * inputDimensions);
ob.dimensions = inputDimensions;
ob.type = input->buffer().type;

View File

@ -15,20 +15,46 @@ class ArgMaxTest : public MNNTestCase {
public:
virtual ~ArgMaxTest() = default;
virtual bool run() {
auto input = _Input({4, 4}, NHWC);
input->setName("input_tensor");
auto ArgMax_ = [](VARP input, int axis, int topK, int outMaxVal) {
using namespace MNN;
// input = _checkNC4HW4(input);
std::unique_ptr<OpT> op(new OpT);
op->main.type = OpParameter_ArgMax;
op->type = OpType_ArgMax;
op->main.value = new ArgMaxT;
op->main.AsArgMax()->axis = axis;
op->main.AsArgMax()->outMaxVal = outMaxVal;
op->main.AsArgMax()->topK = topK;
op->main.AsArgMax()->softmaxThreshold = 0;
return (Variable::create(Expr::create(std::move(op), {input})));
};
auto input_nhwc = _Input({4, 4}, NHWC);
auto input_nchw = _Input({4, 4}, NC4HW4);
input_nhwc->setName("input_tensor_nhwc");
input_nchw->setName("input_tensor_nchw");
// set input data
const float inpudata[] = {-1.0, 2.0, -3.0, 4.0, 5.0, -6.0, 7.0, -8.0,
-9.0, -10.0, 11.0, 12.0, 13.0, 14.0, -15.0, -16.0};
auto inputPtr = input->writeMap<float>();
const float inpudata[] = {-1.0, 2.0, -3.0, 4.0,
5.0, -6.0, 7.0, -8.0,
-9.0, -10.0, 11.0, 12.0,
13.0, 14.0, -15.0, -16.0};
auto inputPtr = input_nhwc->writeMap<float>();
memcpy(inputPtr, inpudata, 16 * sizeof(float));
input->unMap();
auto output_0 = _ArgMax(input, 0);
auto output_1 = _ArgMax(input, 1);
inputPtr = input_nchw->writeMap<float>();
memcpy(inputPtr, inpudata, 16 * sizeof(float));
input_nhwc->unMap();
input_nchw->unMap();
auto output_0 = _ArgMax(input_nhwc, 0);
auto output_1 = _ArgMax(input_nhwc, 1);
auto output_2 = ArgMax_(input_nchw, 1, 2, 0);
auto output_3 = ArgMax_(input_nchw, 1, 1, 1);
const std::vector<int> expectedOutput_0 = {3, 3, 2, 2};
const std::vector<int> expectedOutput_1 = {3, 2, 3, 1};
const std::vector<float> expectedOutput_2 = {3, 1, 2, 0, 3, 2, 1, 0};
const std::vector<float> expectedOutput_3 = {3, 4, 2, 7, 3, 12, 1, 14};
auto gotOutput_0 = output_0->readMap<int>();
auto gotOutput_1 = output_1->readMap<int>();
auto gotOutput_2 = output_2->readMap<float>();
auto gotOutput_3 = output_3->readMap<float>();
if (!checkVector<int>(gotOutput_0, expectedOutput_0.data(), 4, 0)) {
MNN_ERROR("ArgMaxTest test axis_0 failed!\n");
return false;
@ -37,6 +63,14 @@ public:
MNN_ERROR("ArgMaxTest test axis_1 failed!\n");
return false;
}
if (!checkVector<float>(gotOutput_2, expectedOutput_2.data(), 8, 0)) {
MNN_ERROR("ArgMaxTest test axis_1_top2 failed!\n");
return false;
}
if (!checkVector<float>(gotOutput_3, expectedOutput_3.data(), 8, 0)) {
MNN_ERROR("ArgMaxTest test axis_1_outVal failed!\n");
return false;
}
return true;
}
};
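A reading of the new expectations, as the values above suggest: output_2 (axis 1, topK = 2, outMaxVal = 0) lists the two largest indices per row, e.g. {3, 1} for the first row {-1, 2, -3, 4}; output_3 (topK = 1, outMaxVal = 1) interleaves index and value per row, e.g. {3, 4} for that same row.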

View File

@ -98,4 +98,40 @@ public:
return true;
}
};
class MatMulSpeedConstTest : public MNNTestCase {
public:
virtual bool run() {
int e = 540, h = 540, l = 320;
auto res = _runConst(e, h, l);
if (!res) {
return false;
}
return _runConst(1024, 1024, 1024);
}
bool _runConst(int e, int h, int l) {
{
// Use Conv1x1 instead of MatMul
auto x0 = _Input({1, l, 1, h}, NC4HW4, halide_type_of<float>());
auto y = _Conv(0.0f, 0.0f, x0, {l, e}, {1, 1});
Variable::prepareCompute({y});
const auto time = 100;
MNN_PRINT("MatMul B Const (Conv1x1): [%d, %d, %d], run %d\n", h, l, e, time);
{
AUTOTIME;
//Prepare
x0->writeMap<float>();
y->readMap<float>();
}
AUTOTIME;
for (int t = 0; t < time; ++t) {
x0->writeMap<float>();
y->readMap<float>();
}
}
return true;
}
};
MNNTestSuiteRegister(MatMulSpeedTest, "speed/MatMulTest");
MNNTestSuiteRegister(MatMulSpeedConstTest, "speed/MatMulBConstTest");
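The "Conv1x1 instead of MatMul" benchmark relies on a 1x1 convolution over an input with l channels and h positions being exactly an (e x l) by (l x h) matrix product. A naive reference of what it computes, ignoring the NC4HW4 packing:

#include <cstddef>
#include <vector>

// Y (e x h) = W (e x l) * X (l x h), all row-major.
std::vector<float> conv1x1AsMatMul(const std::vector<float>& W,
                                   const std::vector<float>& X,
                                   int e, int l, int h) {
    std::vector<float> Y(static_cast<std::size_t>(e) * h, 0.0f);
    for (int o = 0; o < e; ++o)
        for (int c = 0; c < l; ++c)
            for (int p = 0; p < h; ++p)
                Y[o * h + p] += W[o * l + c] * X[c * h + p];
    return Y;
}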

View File

@ -55,6 +55,7 @@ public:
bool saveHalfFloat;
bool forTraining = false;
int weightQuantBits = 0; // If weightQuantBits > 0, it is the bit width (2-8) used for weight quantization
bool weightQuantAsymmetric = false;
// The path of the model compression file that stores the int8 calibration table
// or sparse parameters.
std::string compressionParamsFile = "";

View File

@ -46,6 +46,9 @@ cxxopts::Options Cli::initializeMNNConvertArgs(modelConfig &modelPath, int argc,
"debug", "Enable debugging mode.")(
"forTraining", "whether or not to save training ops BN and Dropout, default: false", cxxopts::value<bool>())(
"weightQuantBits", "save conv/matmul/LSTM float weights to int8 type, only optimize for model size, 2-8 bits, default: 0, which means no weight quant", cxxopts::value<int>())(
"weightQuantAsymmetric", "the default weight-quant uses SYMMETRIC quant method, which is compatible with old MNN versions. "
"you can try set --weightQuantAsymmetric to use asymmetric quant method to improve accuracy of the weight-quant model in some cases, "
"but asymmetric quant model cannot run on old MNN versions. You will need to upgrade MNN to new version to solve this problem. default: false", cxxopts::value<bool>())(
"compressionParamsFile",
"The path of the compression parameters that stores activation, "
"weight scales and zero points for quantization or information "
@ -159,6 +162,9 @@ cxxopts::Options Cli::initializeMNNConvertArgs(modelConfig &modelPath, int argc,
if (result.count("weightQuantBits")) {
modelPath.weightQuantBits = result["weightQuantBits"].as<int>();
}
if (result.count("weightQuantAsymmetric")) {
modelPath.weightQuantAsymmetric = true;
}
if (result.count("saveStaticModel")) {
modelPath.saveStaticModel = true;
}

View File

@ -17,6 +17,8 @@
#include "writeFb.hpp"
#include "cpp/ConfigFile.hpp"
#include <MNN/MNNDefine.h>
#include "cli.hpp"
#include "../../common/Global.hpp"
using namespace MNN;
using namespace std;
@ -33,6 +35,23 @@ static float findAbsMax(const float *weights, const int count) {
return absMax;
}
static std::vector<float> findMinMax(const float *weights, const int count) {
float min = weights[0];
float max = weights[0];
for (int i = 1; i < count; i++) {
float value = weights[i];
if (value > max) {
max = value;
}
if (value < min) {
min = value;
}
}
return {min, max};
}
static void WriteBlobDim(ostream &out, std::vector<int> dims)
{
char tmp[4];
@ -44,6 +63,7 @@ static void WriteBlobDim(ostream &out, std::vector<int> dims)
out.write((const char*)(&tmpShort), 2);
}
}
static void FillBuffer(char *buf, unsigned int buf_len, const char *arr, unsigned int arr_len, unsigned char iNeedBits)
{
memset(buf, 0, buf_len);
@ -72,55 +92,111 @@ static void FillBuffer(char *buf, unsigned int buf_len, const char *arr, unsigne
}
}
static void GetWeightSet(set<int> &setWeight, const float* weightData, const float* alphaData, int area, int channel)
static void GetWeightSet(set<int> &setWeight, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag)
{
setWeight.clear();
for (int i = 0; i < channel; i++)
{
float alpha = alphaData[i];
if (alpha <= 1e-6f)
if (asymmetricQuantFlag) {
for (int i = 0; i < channel; i++)
{
setWeight.insert(0);
continue;
float min = alphaData[2*i];
float alpha = alphaData[2*i+1];
if (alpha <= 1e-6f)
{
setWeight.insert(-128);
continue;
}
for (int j = 0; j < area; j++)
{
float weight = weightData[i * area + j];
setWeight.insert(round((weight - min) / alpha) + (-128));
}
}
for (int j = 0; j < area; j++)
} else {
for (int i = 0; i < channel; i++)
{
float weight = weightData[i * area + j];
setWeight.insert(round(weight / alpha));
float alpha = alphaData[i];
if (alpha <= 1e-6f)
{
setWeight.insert(0);
continue;
}
for (int j = 0; j < area; j++)
{
float weight = weightData[i * area + j];
setWeight.insert(round(weight / alpha));
}
}
}
}
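// Note: the size of setWeight determines the code width used by both writers below,
// iNeedBits = ceil(log2(setWeight.size())). A channel with alpha <= 1e-6 still
// contributes one code point (0 in the symmetric path, -128 in the asymmetric path).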
static float GetSparsity(const float* weightData, int weightSize, unsigned int& nnz, int iMaxStep = -1)
static float GetSparsity(const float* weightData, int weightSize, unsigned int& nnz, const float* alphaData, int area, int channel, bool asymmetricQuantFlag, int iMaxStep = -1)
{
nnz = 0;
int iPreIdx = 0;
float sparsity;
for (int i = 0; i < weightSize; i++)
{
if (fabs(weightData[i]) > 1e-8f)
{
nnz++;
iPreIdx = i;
}
if ((i - iPreIdx >= iMaxStep) && (iMaxStep != -1))
{
nnz++;
iPreIdx = i;
}
}
if (asymmetricQuantFlag) {
for (int i = 0; i < weightSize; i++)
{
float min = alphaData[2*(i/area)];
float alpha = alphaData[2*(i/area)+1];
int zeroQuant = -128;
if (alpha > 1e-6) {
zeroQuant = round((0.0f - min) / alpha) + (-128);
}
float weight = weightData[i];
int value = -128;
if (alpha > 1e-6)
{
value = round((weight - min) / alpha) + (-128);
}
if (value != zeroQuant)
{
nnz++;
iPreIdx = i;
}
if ((i - iPreIdx >= iMaxStep) && (iMaxStep != -1))
{
nnz++;
iPreIdx = i;
}
}
} else {
for (int i = 0; i < weightSize; i++)
{
float alpha = alphaData[i / area];
float weight = weightData[i];
int value = 0;
if (alpha > 1e-6f)
{
value = round(weight / alpha);
}
if (value != 0)
{
nnz++;
iPreIdx = i;
}
if ((i - iPreIdx >= iMaxStep) && (iMaxStep != -1))
{
nnz++;
iPreIdx = i;
}
}
}
sparsity = 1 - 1.0f * nnz / weightSize;
return sparsity;
}
unsigned int GetBestMaxStep(const float* weightData, int weightSize, unsigned char& iMaxStepBits, int BlobDataSize)
unsigned int GetBestMaxStep(const float* weightData, int weightSize, unsigned char& iMaxStepBits, int BlobDataSize, const float* alphaData, int area, int channel, bool asymmetricQuantFlag)
{
size_t szBestSize = 1000000000;
unsigned int best_nnz = 0;
for (int i = 2; i < 9; i++)
{
unsigned int nnz = 0;
GetSparsity(weightData, weightSize, nnz, pow(2, i) - 1);
GetSparsity(weightData, weightSize, nnz, alphaData, area, channel, asymmetricQuantFlag, pow(2, i) - 1);
size_t tmp = ceil(0.125 * nnz * i) + ceil(0.125 * nnz * BlobDataSize);
if (tmp < szBestSize)
{
@ -132,12 +208,12 @@ unsigned int GetBestMaxStep(const float* weightData, int weightSize, unsigned ch
return best_nnz;
}
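// Size model used above: for a candidate gap width of i bits (iMaxStep = 2^i - 1),
// the sparse encoding costs ceil(0.125 * nnz * i) bytes for the gap indices plus
// ceil(0.125 * nnz * BlobDataSize) bytes for the quantized values; the loop keeps
// the i in [2, 8] that minimizes the sum (reported via iMaxStepBits) and returns
// the corresponding nnz.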
static void WriteCQBlobs(ostream &out, const float* weightData, const float* alphaData, int area, int channel)
static void WriteCQBlobs(ostream &out, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag)
{
// Push quantized values into the buffer.
// Collect the distinct int codes used by this blob and determine how many bits they need.
set<int> setWeight;
GetWeightSet(setWeight, weightData, alphaData, area, channel);
GetWeightSet(setWeight, weightData, alphaData, area, channel, asymmetricQuantFlag);
int iCount = setWeight.size();
int iNeedBits = ceil(log2(iCount));
if (iNeedBits > 8) {
@ -155,19 +231,38 @@ static void WriteCQBlobs(ostream &out, const float* weightData, const float* alp
{
char *arr = new char[area * channel];
char *tmp = arr;
for (int i = 0; i < channel; i++)
{
float alpha = alphaData[i];
for (int j = 0; j < area; j++)
if (asymmetricQuantFlag) {
for (int i = 0; i < channel; i++)
{
float weight = weightData[i * area + j];
int value = 0;
if (alpha > 1e-6f)
float min = alphaData[2*i];
float alpha = alphaData[2*i+1];
for (int j = 0; j < area; j++)
{
value = round(weight / alpha);
float weight = weightData[i * area + j];
int value = -128;
if (alpha > 1e-6f)
{
value = round((weight - min) / alpha) + (-128);
}
*tmp = mapWeight[value];
tmp++;
}
}
} else {
for (int i = 0; i < channel; i++)
{
float alpha = alphaData[i];
for (int j = 0; j < area; j++)
{
float weight = weightData[i * area + j];
int value = 0;
if (alpha > 1e-6f)
{
value = round(weight / alpha);
}
*tmp = mapWeight[value];
tmp++;
}
*tmp = mapWeight[value];
tmp++;
}
}
FillBuffer(buf, buf_len, arr, area * channel, iNeedBits);
@ -194,10 +289,10 @@ static void WriteCQBlobs(ostream &out, const float* weightData, const float* alp
delete[] buf;
}
static void WriteSparseQuanBlobs(ostream &out, const float* weightData, const float* alphaData, int area, int channel)
static void WriteSparseQuanBlobs(ostream &out, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag)
{
set<int> setWeight;
GetWeightSet(setWeight, weightData, alphaData, area, channel);
GetWeightSet(setWeight, weightData, alphaData, area, channel, asymmetricQuantFlag);
int iDataNeedBits = ceil(log2(setWeight.size()));
unsigned int nnz = 0;
int weightSize = area * channel;
@ -210,7 +305,7 @@ static void WriteSparseQuanBlobs(ostream &out, const float* weightData, const fl
}
}
unsigned char iNeedBits;
nnz = GetBestMaxStep(weightData, weightSize, iNeedBits, iDataNeedBits);
nnz = GetBestMaxStep(weightData, weightSize, iNeedBits, iDataNeedBits, alphaData, area, channel, asymmetricQuantFlag);
//weight buf
size_t data_buf_len = size_t(ceil(0.125 * iDataNeedBits * nnz));
char* data_buf = new char[data_buf_len];
@ -224,33 +319,69 @@ static void WriteSparseQuanBlobs(ostream &out, const float* weightData, const fl
int iMaxStep = pow(2, iNeedBits) - 1;
int iPreIdx = 0;
unsigned char* dTmp = data_arr;
for (int i = 0; i < weightSize; i++)
{
float alpha = alphaData[i / area];
float weight = weightData[i];
int value = 0;
if (alpha > 1e-6f)
if (asymmetricQuantFlag) {
for (int i = 0; i < weightSize; i++)
{
value = round(weight / alpha);
}
float min = alphaData[2*(i/area)];
float alpha = alphaData[2*(i/area)+1];
int zeroQuant = -128;
if (alpha > 1e-6) {
zeroQuant = round((0.0f - min) / alpha) + (-128);
}
if (value != 0)
{
*dTmp = mapWeight[value];
*tmp = i - iPreIdx;
iPreIdx = i;
tmp++;
dTmp++;
}
if (i - iPreIdx >= iMaxStep)
{
*dTmp = mapWeight[0];
*tmp = i - iPreIdx;
iPreIdx = i;
tmp++;
dTmp++;
}
}
float weight = weightData[i];
int value = -128;
if (alpha > 1e-6)
{
value = round((weight - min) / alpha) + (-128);
}
if (value != zeroQuant)
{
*dTmp = mapWeight[value];
*tmp = i - iPreIdx;
iPreIdx = i;
tmp++;
dTmp++;
}
if (i - iPreIdx >= iMaxStep)
{
*dTmp = mapWeight[zeroQuant];
*tmp = i - iPreIdx;
iPreIdx = i;
tmp++;
dTmp++;
}
}
} else {
for (int i = 0; i < weightSize; i++)
{
float alpha = alphaData[i / area];
float weight = weightData[i];
int value = 0;
if (alpha > 1e-6f)
{
value = round(weight / alpha);
}
if (value != 0)
{
*dTmp = mapWeight[value];
*tmp = i - iPreIdx;
iPreIdx = i;
tmp++;
dTmp++;
}
if (i - iPreIdx >= iMaxStep)
{
*dTmp = mapWeight[0];
*tmp = i - iPreIdx;
iPreIdx = i;
tmp++;
dTmp++;
}
}
}
FillBuffer(buf, buf_len, (char*) arr_idx, nnz, iNeedBits);
FillBuffer(data_buf, data_buf_len, (char*) data_arr, nnz, iDataNeedBits);
delete[] arr_idx;
@ -392,6 +523,8 @@ int writeFb(std::unique_ptr<MNN::NetT>& netT, const std::string& MNNModelFile, m
}
auto CastParamsToInt8 = [](std::unique_ptr<MNN::OpT>& op, int bits) {
auto gConverterConfig = Global<modelConfig>::Get();
bool asymmetricQuantFlag = gConverterConfig->weightQuantAsymmetric;
const auto opType = op->type;
// Bits must from 2-8
bits = std::max(bits, 2);
@ -408,20 +541,35 @@ int writeFb(std::unique_ptr<MNN::NetT>& netT, const std::string& MNNModelFile, m
const int weightSize = param->weight.size();
int kernelNum = common->outputCount;
int kernelSize = weightSize / kernelNum;
std::vector<float> scales(kernelNum);
auto weightData = param->weight.data();
for (int k = 0; k < kernelNum; k++) {
int beginIndex = k * kernelSize;
auto absMax = findAbsMax(weightData + beginIndex, kernelSize);
// TODO: Support low bit
scales[k] = absMax / thredhold;
std::vector<float> scales;
if (asymmetricQuantFlag) {
scales.resize(kernelNum*2);
for (int k = 0; k < kernelNum; k++) {
int beginIndex = k * kernelSize;
auto minAndMax = findMinMax(weightData + beginIndex, kernelSize);
float min = minAndMax[0];
float max = minAndMax[1];
float scale = (max - min) / (127 + 128);
scales[2*k] = min;
scales[2*k+1] = scale;
}
} else {
scales.resize(kernelNum);
for (int k = 0; k < kernelNum; k++) {
int beginIndex = k * kernelSize;
auto absMax = findAbsMax(weightData + beginIndex, kernelSize);
scales[k] = absMax / thredhold;
}
}
std::ostringstream outputStringStreamCQ;
WriteCQBlobs(outputStringStreamCQ, weightData, scales.data(), kernelSize, kernelNum);
WriteCQBlobs(outputStringStreamCQ, weightData, scales.data(), kernelSize, kernelNum, asymmetricQuantFlag);
std::ostringstream outputStringStreamSQ;
WriteSparseQuanBlobs(outputStringStreamSQ, weightData, scales.data(), kernelSize, kernelNum);
WriteSparseQuanBlobs(outputStringStreamSQ, weightData, scales.data(), kernelSize, kernelNum, asymmetricQuantFlag);
param->quanParameter.reset(new MNN::IDSTQuanT);
auto tempString = outputStringStreamCQ.str();
@ -435,6 +583,9 @@ int writeFb(std::unique_ptr<MNN::NetT>& netT, const std::string& MNNModelFile, m
::memcpy(param->quanParameter->buffer.data(), tempString.data(), tempString.size());
param->quanParameter->alpha = std::move(scales);
param->quanParameter->quantScale = 1.0f;
if (asymmetricQuantFlag) {
param->quanParameter->readType = kernelNum;
}
}
break;
}
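For concreteness, a small self-contained example of the per-channel numbers the asymmetric branch stores (scales[2k] = min, scales[2k+1] = (max - min) / 255) and the round trip they imply; the dequantization line is inferred from the quantization formula above rather than taken from this diff, and the values are made up.
// Hedged worked example of the asymmetric per-channel parameters (values are made up).
#include <cmath>
#include <cstdio>
int main() {
    float min = -0.30f, max = 0.50f;             // range of one output channel's weights
    float scale = (max - min) / (127 + 128);     // same expression as in CastParamsToInt8
    float w = 0.12f;                             // one weight from that channel
    int q = static_cast<int>(std::round((w - min) / scale)) - 128;   // stored int8 code
    float back = (q + 128) * scale + min;        // inferred inverse mapping, not in this diff
    std::printf("scale=%.6f q=%d back=%.4f\n", scale, q, back);      // q=6, back~=0.1204
    return 0;
}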

View File

@ -418,13 +418,14 @@ bool fuseConstIntoSubgraph(MNN::NetT* net, const std::vector<MNN::SubGraphProtoT
}
// Try Optimize Subgraph for more const op get
auto* ctx = Global<OptimizeContext>::Get();
std::unordered_map<std::string, VARP> empty;
for (auto mutable_subgraph : modifiedSubGraph) {
std::unique_ptr<MNN::NetT> subnet(new MNN::NetT);
subnet->oplists = std::move(mutable_subgraph->nodes);
subnet->tensorName = std::move(mutable_subgraph->tensors);
subnet->sourceType = ctx->source;
std::unique_ptr<MNN::NetT> new_subnet = optimizeNetImpl(subnet, {}, ctx->is_train, false /*verbose*/);
std::unique_ptr<MNN::NetT> new_subnet = optimizeNetImpl(subnet, empty, ctx->is_train, false /*verbose*/);
mutable_subgraph->nodes = std::move(subnet->oplists);
MNN::SubGraphProtoT* new_subgraph = mutable_subgraph;
@ -460,7 +461,8 @@ std::unique_ptr<MNN::NetT> optimizeNet(std::unique_ptr<MNN::NetT>& originNet, bo
auto ctx = OptimizeContext{subgraphs, forTraining, originNet->sourceType};
Global<OptimizeContext>::Reset(&ctx);
std::unique_ptr<MNN::NetT> net = optimizeNetImpl(originNet, {}, forTraining);
std::unordered_map<std::string, VARP> empty;
std::unique_ptr<MNN::NetT> net = optimizeNetImpl(originNet, empty, forTraining);
fuseConstIntoSubgraph(net.get(), ctx.completed_subgraphs);
for (auto* subgraph : ctx.completed_subgraphs) {
net->subgraphs.emplace_back(subgraph);

View File

@ -7,6 +7,13 @@
//
#include <MNN/expr/Optimizer.hpp>
#include <stdexcept>
#define MNN_THROW_CHECK(success, log) \
if(!(success)){ \
MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
throw std::runtime_error("Error for onnx convert");\
}
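A minimal usage sketch of the macro added above; the condition and message are hypothetical, not taken from an existing converter call site.
// Hypothetical call site (for illustration only):
int axis = -2;  // assume this was read from an ONNX node attribute
MNN_THROW_CHECK(axis >= 0, "axis must be non-negative for this op");
// On failure the macro prints the stringized condition and message via MNN_ERROR
// and then throws std::runtime_error("Error for onnx convert").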
namespace MNN {
namespace Express {

View File

@ -0,0 +1,149 @@
//
// FuseTfPrelu.cpp
// MNNConverter
//
// Created by MNN on 2020/11/13.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "../TemplateMerge.hpp"
#include "MNN/expr/MathOp.hpp"
#include "MNN/expr/NeuralNetWorkOp.hpp"
#include "MNN_generated.h"
#include "../../common/Global.hpp"
namespace MNN {
namespace Express {
enum PreluCases {
None,
OCRCustom,
};
static auto gRegister = []() {
auto match = [](EXPRP expr) {
PreluCases preluCase = PreluCases::None;
// ocr custom case of prelu
{
if (nullptr == expr->get()) {
return false;
}
if (expr->get()->type() != OpType_Eltwise) {
return false;
}
if (expr->get()->main_as_Eltwise()->type() != EltwiseType_SUM) {
return false;
}
if (expr->inputs().size() != 2) {
return false;
}
auto leftReluVar = expr->inputs().at(0);
auto leftReluExpr = leftReluVar->expr().first;
if (leftReluExpr->get() == nullptr) {
return false;
}
if (leftReluExpr->get()->type() != OpType_ReLU) {
return false;
}
auto rightBinaryVar = expr->inputs().at(1);
auto rightBinaryExpr = rightBinaryVar->expr().first;
if (rightBinaryExpr->get() == nullptr) {
return false;
}
if (rightBinaryExpr->get()->type() != OpType_BinaryOp) {
return false;
}
if (rightBinaryExpr->get()->main_as_BinaryOp()->opType() != BinaryOpOperation_MUL) {
return false;
}
auto rightBinaryConstVar = rightBinaryExpr->inputs().at(0);
auto rightBinaryConstExpr = rightBinaryConstVar->expr().first;
if (rightBinaryConstExpr->get() != nullptr) {
return false;
}
auto rightBinaryReluVar = rightBinaryExpr->inputs().at(1);
auto rightBinaryReluExpr = rightBinaryReluVar->expr().first;
if (rightBinaryReluExpr->get() == nullptr) {
return false;
}
bool cond = ((rightBinaryConstExpr->inputType() == VARP::CONSTANT) && (rightBinaryReluExpr->get()->type() == OpType_ReLU));
if (!cond) {
return false;
}
auto unaryVar = rightBinaryReluExpr->inputs().at(0);
auto unaryExpr = unaryVar->expr().first;
if (unaryExpr->get() == nullptr) {
return false;
}
if (unaryExpr->get()->type() != OpType_UnaryOp) {
return false;
}
if (unaryExpr->get()->main_as_UnaryOp()->opType() != UnaryOpOperation_NEG) {
return false;
}
auto leftSourceVar = leftReluExpr->inputs().at(0);
auto rightSourceVar = unaryExpr->inputs().at(0);
if (leftSourceVar->expr() != rightSourceVar->expr()) {
return false;
}
preluCase = PreluCases::OCRCustom;
}
Global<PreluCases>::Reset(&preluCase);
if (preluCase != PreluCases::None) {
return true;
}
return false;
};
auto transform = [](EXPRP expr) {
auto preluCase = Global<PreluCases>::Get();
// ocr custom case of prelu
if (*preluCase == PreluCases::OCRCustom) {
auto leftReluVar = expr->inputs().at(0);
auto leftReluExpr = leftReluVar->expr().first;
auto sourceVar = leftReluExpr->inputs().at(0);
auto rightBinaryVar = expr->inputs().at(1);
auto rightBinaryExpr = rightBinaryVar->expr().first;
auto rightBinaryConstVar = rightBinaryExpr->inputs().at(0);
std::unique_ptr<MNN::OpT> PreluOp(new OpT);
PreluOp->type = OpType_PReLU;
PreluOp->name = expr->name();
PreluOp->main.type = OpParameter_PRelu;
PreluOp->main.value = new PReluT;
auto PreluParameter = PreluOp->main.AsPRelu();
{
auto PreluPoint = _Negative(rightBinaryConstVar);
auto PreluPointInfo = PreluPoint->getInfo();
auto PreluPointPtr = PreluPoint->readMap<float>();
PreluParameter->slope.resize(PreluPointInfo->size);
::memcpy(PreluParameter->slope.data(), PreluPointPtr, PreluPointInfo->size * sizeof(float));
PreluParameter->slopeCount = PreluPointInfo->size;
}
auto newVar = Variable::create(Expr::create(PreluOp.get(), {sourceVar}, expr->outputSize()));
newVar->setName(expr->outputName(0));
Expr::replace(expr, newVar->expr().first);
return true;
}
return false;
};
TemplateMerge::getInstance("Merge").insertTemplate("FuseTfPrelu", match, transform);
return true;
}();
}
} // namespace MNN
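The pattern this pass matches relies on the elementwise identity PReLU(x) = relu(x) - slope * relu(-x); the recognized subgraph computes relu(x) + C * relu(neg(x)), so the transform negates the constant C to obtain the slope. A scalar sketch of the equivalence, with illustrative helper names:
// Scalar sketch of the identity behind FuseTfPrelu (illustrative helpers only).
#include <algorithm>
static float relu(float x) { return std::max(0.0f, x); }
// Matched subgraph: relu(x) + c * relu(-x).
static float matchedPattern(float x, float c) { return relu(x) + c * relu(-x); }
// PReLU with slope s: x for x > 0, s * x otherwise.
static float prelu(float x, float s) { return x > 0.0f ? x : s * x; }
// For every x: matchedPattern(x, c) == prelu(x, -c), e.g.
// matchedPattern(-2.0f, -0.25f) == prelu(-2.0f, 0.25f) == -0.5f,
// which is why the transform stores _Negative(rightBinaryConstVar) as the PReLU slope.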

Some files were not shown because too many files have changed in this diff.