Mirror of https://github.com/alibaba/MNN.git, commit 135f3d0f84

@@ -1,6 +1,6 @@
Pod::Spec.new do |s|
s.name = "MNN"
s.version = "0.1.1"
s.version = "1.1.0"
s.summary = "MNN"

s.description = <<-DESC
@@ -46,6 +46,7 @@ Pod::Spec.new do |s|
'schema/current/*.{h}',\
'3rd_party/flatbuffers/include/flatbuffers/*.{h}',\
'source/core/**/*.{h,c,m,mm,cc,hpp,cpp}',\
'source/utils/**/*.{h,c,m,mm,cc,hpp,cpp}',\
'source/geometry/**/*.{h,c,m,mm,cc,hpp,cpp}',\
'source/cv/**/*.{h,c,m,mm,cc,hpp,cpp}',\
'source/math/**/*.{h,c,m,mm,cc,hpp,cpp,metal}',\
@@ -57,6 +58,6 @@ Pod::Spec.new do |s|
'express/**/*.{hpp,cpp}'
s.header_mappings_dir = 'include'

s.pod_target_xcconfig = {'METAL_LIBRARY_FILE_BASE' => 'mnn', 'HEADER_SEARCH_PATHS' => '"$(PODS_TARGET_SRCROOT)/include" "$(PODS_TARGET_SRCROOT)/3rd_party/flatbuffers/include" "$(PODS_TARGET_SRCROOT)/source" "$(PODS_TARGET_SRCROOT)/3rd_party/half"', 'GCC_PREPROCESSOR_DEFINITIONS' => '$(inherited) MNN_CODEGEN_REGISTER=1 MNN_SUPPORT_TFLITE_QUAN=1'}
s.pod_target_xcconfig = {'METAL_LIBRARY_FILE_BASE' => 'mnn', 'HEADER_SEARCH_PATHS' => '"$(PODS_TARGET_SRCROOT)/include" "$(PODS_TARGET_SRCROOT)/3rd_party/flatbuffers/include" "$(PODS_TARGET_SRCROOT)/source" "$(PODS_TARGET_SRCROOT)/3rd_party/half"', 'GCC_PREPROCESSOR_DEFINITIONS' => '$(inherited) MNN_CODEGEN_REGISTER=1 MNN_SUPPORT_TFLITE_QUAN=1 MNN_METAL_ENABLED=1'}
s.user_target_xcconfig = { 'OTHER_LDFLAGS' => '-force_load $(BUILD_DIR)/$(CONFIGURATION)$(EFFECTIVE_PLATFORM_NAME)/MNN/libMNN.a', 'HEADER_SEARCH_PATHS' => '"$(PODS_TARGET_SRCROOT)/include"' }
end

@@ -77,22 +77,45 @@ public:
MNN_ASSERT(expr->get() != nullptr);
MNN_ASSERT(expr->get()->type() == OpType_BatchNorm);
auto bnPa = expr->get()->main_as_BatchNorm();
auto& inputs = expr->inputs();
int dims = 4;
if (!inputs.empty()) {
auto info = inputs[0]->getInfo();
if (nullptr != info) {
dims = info->dim.size();
}
}
mEps = bnPa->epsilon();
mMomentum = m;
mChannels = bnPa->channels();
std::vector<int> statShape;
std::vector<int> reductionDims;
int channels = mChannels;
if (dims == 2) {
statShape = {1, channels};
mReductionDims = {0};
}
if (dims == 3) {
statShape = {1, channels, 1};
mReductionDims = {0, 2};
}
if (dims == 4) {
statShape = {1, channels, 1, 1};
mReductionDims = {0, 2, 3};
}
MNN_ASSERT(bnPa->biasData()->size() == mChannels);
mBias = _TrainableParam(bnPa->biasData()->data(), {1, mChannels, 1, 1}, NCHW);
mBias = _TrainableParam(bnPa->biasData()->data(), statShape, NCHW);
MNN_ASSERT(bnPa->slopeData()->size() == mChannels);
mScale = _TrainableParam(bnPa->slopeData()->data(), {1, mChannels, 1, 1}, NCHW);
mScale = _TrainableParam(bnPa->slopeData()->data(), statShape, NCHW);
MNN_ASSERT(bnPa->meanData()->size() == mChannels);
mRunningMean = _Const(bnPa->meanData()->data(), {1, mChannels, 1, 1}, NCHW);
mRunningMean = _Const(bnPa->meanData()->data(), statShape, NCHW);
MNN_ASSERT(bnPa->meanData()->size() == mChannels);
mRunningVariance = _Const(bnPa->varData()->data(), {1, mChannels, 1, 1}, NCHW);
mRunningVariance = _Const(bnPa->varData()->data(), statShape, NCHW);
addParameter(mScale);
addParameter(mBias);
mRunningVariancePos = addParameter(mRunningVariance);
mRunningMeanPos = addParameter(mRunningMean);
mReductionDims = {0, 2, 3};

setType("BatchNorm");
}
BatchNormModule(const int channels, const int dims = 4, const float m = 0.99, const float e = 1e-5) {
@@ -100,14 +123,16 @@ public:
mEps = e;
mChannels = channels;

MNN_ASSERT((dims == 2) || (dims == 4));

std::vector<int> statShape;
std::vector<int> reductionDims;
if (dims == 2) {
statShape = {1, channels};
mReductionDims = {0};
}
if (dims == 3) {
statShape = {1, channels, 1};
mReductionDims = {0, 2};
}
if (dims == 4) {
statShape = {1, channels, 1, 1};
mReductionDims = {0, 2, 3};
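
Note on the change above: instead of always creating the batch-norm statistics with a fixed {1, C, 1, 1} shape, both constructors now derive the statistics shape and the reduction axes from the input rank. A minimal Python sketch of that mapping (a hypothetical helper for illustration, not MNN code):

```python
# Hypothetical helper mirroring the statShape / mReductionDims logic above.
def bn_stat_shape(channels, dims):
    """Return (stat_shape, reduction_axes) for an input of rank `dims` (NC / NCL / NCHW)."""
    if dims == 2:
        return [1, channels], [0]
    if dims == 3:
        return [1, channels, 1], [0, 2]
    return [1, channels, 1, 1], [0, 2, 3]  # default: 4-D NCHW

# Example: a rank-3 input keeps per-channel stats of shape [1, C, 1]
# and reduces over the batch and length axes.
print(bn_stat_shape(32, 3))  # ([1, 32, 1], [0, 2])
```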

@@ -0,0 +1,31 @@
## scp package_scripts/linux/build.sh mnnteam@30.6.159.68:/mnt/partition4/CI/scripts
# scp ~/.ssh/id_rsa* mnnteam@30.6.159.68:/mnt/partition4/CI
# ssh mnnteam@30.6.159.68
# docker run --name CI_tmp --rm -it -v /mnt/partition4/CI:/mnt reg.docker.alibaba-inc.com/shuhui/manylinux_2014 bash /mnt/scripts/build.sh -r git@gitlab.alibaba-inc.com:AliNN/AliNNPrivate.git
# docker run --name CI_tmp --rm -it -v /mnt/partition4/CI:/mnt reg.docker.alibaba-inc.com/shuhui/manylinux_2014 bash /mnt/scripts/build.sh -r git@gitlab.alibaba-inc.com:AliNN/MNN.git
# docker run --name CI_tmp --rm -it -v /mnt/partition4/CI:/mnt reg.docker.alibaba-inc.com/shuhui/manylinux_2014 bash /mnt/scripts/build.sh -r git@github.com:alibaba/MNN.git

set -e

usage() {
echo "Usage: $0 -r code_repo"
echo -e "\t-r code repository"
exit 1
}

while getopts 'r:' opt; do
case "$opt" in
r ) CODE_REPO=$OPTARG ;;
h|? ) usage ;;
esac
done

yes | cp /mnt/id_rsa* ~/.ssh 2>/dev/null
cd /root
git clone $CODE_REPO MNN && cd MNN
mkdir MNN
cp -r include/* MNN
./package_scripts/linux/build_lib.sh -o MNN-CPU/lib
# ./package_scripts/linux/build_tools.sh -o MNN-CPU/tools
./package_scripts/linux/build_whl.sh -o MNN-CPU/py_whl
# ./package_scripts/linux/build_bridge.sh -o MNN-CPU/py_bridge
@ -0,0 +1,66 @@
|
|||
# ./package_scripts/linux/build_lib.sh -o MNN-CPU/lib
|
||||
# ./package_scripts/linux/build_lib.sh -o MNN-CPU-OPENCL/lib -b
|
||||
|
||||
# MNN
|
||||
# |--- Debug
|
||||
# | |--- libMNN.a
|
||||
# | |--- libMNN.so
|
||||
# |
|
||||
# |--- Release
|
||||
# |--- libMNN.a
|
||||
# |--- libMNN.so
|
||||
|
||||
set -e
|
||||
|
||||
usage() {
|
||||
echo "Usage: $0 -o path [-b]"
|
||||
echo -e "\t-o package files output directory"
|
||||
echo -e "\t-b opencl backend"
|
||||
exit 1
|
||||
}
|
||||
|
||||
while getopts "o:hb" opt; do
|
||||
case "$opt" in
|
||||
o ) path=$OPTARG ;;
|
||||
b ) opencl=true ;;
|
||||
h|? ) usage ;;
|
||||
esac
|
||||
done
|
||||
|
||||
# clear and create package directory
|
||||
./schema/generate.sh
|
||||
rm -rf $path && mkdir -p $path
|
||||
mkdir -p $path/Debug
|
||||
mkdir -p $path/Release
|
||||
|
||||
PACKAGE_PATH=$(realpath $path)
|
||||
|
||||
CMAKE_ARGS="-DMNN_SEP_BUILD=OFF"
|
||||
if [ ! -z $opencl ]; then
|
||||
CMAKE_ARGS="$CMAKE_ARGS -DMNN_OPENCL=ON"
|
||||
fi
|
||||
|
||||
rm -rf build && mkdir build
|
||||
pushd build
|
||||
|
||||
# Debug Dynamic MNN.framework
|
||||
[ -f CMakeCache.txt ] && rm CMakeCache.txt
|
||||
cmake $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_BUILD_SHARED_LIBS=ON .. && make -j24
|
||||
cp libMNN.so $PACKAGE_PATH/Debug
|
||||
|
||||
# Debug Static MNN.framework
|
||||
[ -f CMakeCache.txt ] && rm CMakeCache.txt
|
||||
cmake $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_BUILD_SHARED_LIBS=OFF .. && make -j24
|
||||
cp libMNN.a $PACKAGE_PATH/Debug
|
||||
|
||||
# Release Dynamic MNN.framework
|
||||
[ -f CMakeCache.txt ] && rm CMakeCache.txt
|
||||
cmake $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=ON .. && make -j24
|
||||
cp libMNN.so $PACKAGE_PATH/Release
|
||||
|
||||
# Release Static MNN.framework
|
||||
[ -f CMakeCache.txt ] && rm CMakeCache.txt
|
||||
cmake $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF .. && make -j24
|
||||
cp libMNN.a $PACKAGE_PATH/Release
|
||||
|
||||
popd
|
||||
|
|
@ -0,0 +1,38 @@
|
|||
# ./package_scripts/linux/build_tools.sh -o MNN-CPU/tools
|
||||
# ./package_scripts/linux/build_tools.sh -o MNN-CPU-OPENCL/tools -b
|
||||
|
||||
set -e
|
||||
|
||||
usage() {
|
||||
echo "Usage: $0 -o path [-b]"
|
||||
echo -e "\t-o package files output directory"
|
||||
echo -e "\t-b opencl backend"
|
||||
exit 1
|
||||
}
|
||||
|
||||
while getopts "o:hb" opt; do
|
||||
case "$opt" in
|
||||
o ) path=$OPTARG ;;
|
||||
b ) opencl=true ;;
|
||||
h|? ) usage ;;
|
||||
esac
|
||||
done
|
||||
|
||||
# clear and create package directory
|
||||
./schema/generate.sh
|
||||
rm -rf $path && mkdir -p $path
|
||||
TOOLS_PATH=$(realpath $path)
|
||||
|
||||
CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Release -DMNN_SEP_BUILD=OFF -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_TRAIN=ON -DMNN_PORTABLE_BUILD=ON -DMNN_BUILD_TOOLS=ON -DMNN_BUILD_QUANTOOLS=ON -DMNN_BUILD_BENCHMARK=ON -DMNN_BUILD_TEST=ON"
|
||||
if [ ! -z $opencl ]; then
|
||||
CMAKE_ARGS="$CMAKE_ARGS -DMNN_OPENCL=ON"
|
||||
fi
|
||||
|
||||
rm -rf build && mkdir build
|
||||
pushd build
|
||||
|
||||
[ -f CMakeCache.txt ] && rm CMakeCache.txt
|
||||
cmake $CMAKE_ARGS .. && make -j8
|
||||
cp *.out $TOOLS_PATH
|
||||
|
||||
popd
|
||||
|
|
@ -0,0 +1,50 @@
|
|||
# ./package_scripts/linux/build_whl.sh -o MNN-CPU/py_whl
|
||||
# ./package_scripts/linux/build_whl.sh -o MNN-CPU-OPENCL/py_whl -b
|
||||
|
||||
set -e
|
||||
|
||||
usage() {
|
||||
echo "Usage: $0 -o path [-b]"
|
||||
echo -e "\t-o package files output directory"
|
||||
echo -e "\t-b opencl backend"
|
||||
exit 1
|
||||
}
|
||||
|
||||
while getopts "o:v:hb" opt; do
|
||||
case "$opt" in
|
||||
o ) path=$OPTARG ;;
|
||||
b ) opencl=true ;;
|
||||
h|? ) usage ;;
|
||||
esac
|
||||
done
|
||||
|
||||
./schema/generate.sh
|
||||
rm -rf $path && mkdir -p $path
|
||||
PACKAGE_PATH=$(realpath $path)
|
||||
|
||||
CMAKE_ARGS="-DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_SEP_BUILD=OFF -DMNN_USE_THREAD_POOL=OFF -DMNN_OPENMP=ON"
|
||||
if [ ! -z $opencl ]; then
|
||||
CMAKE_ARGS="$CMAKE_ARGS -DMNN_OPENCL=ON"
|
||||
fi
|
||||
rm -rf pymnn_build && mkdir pymnn_build
|
||||
pushd pymnn_build
|
||||
cmake $CMAKE_ARGS .. && make MNN MNNTrain MNNConvert -j24
|
||||
popd
|
||||
|
||||
pushd pymnn/pip_package
|
||||
rm -rf dist && mkdir dist
|
||||
rm -rf wheelhouse && mkdir wheelhouse
|
||||
|
||||
#Compile wheels
|
||||
for PYBIN in /opt/python/*/bin; do
|
||||
"${PYBIN}/pip" install -U numpy
|
||||
"${PYBIN}/python" setup.py bdist_wheel
|
||||
done
|
||||
|
||||
# Bundle external shared libraries into the wheels
|
||||
for whl in dist/*.whl; do
|
||||
auditwheel repair "$whl" --plat manylinux2014_x86_64 -w wheelhouse
|
||||
done
|
||||
cp wheelhouse/* $PACKAGE_PATH
|
||||
|
||||
popd
|
||||
|
|
@ -1,41 +0,0 @@
|
|||
# MNN_Linux
|
||||
# |------- MNN_Linux_lib
|
||||
# |---------- Dynamic_Library
|
||||
# |---------- Static_Library
|
||||
# |------- MNN_Linux_tools
|
||||
|
||||
LINUX_PACKAGE_NAME="MNN_Linux"
|
||||
|
||||
# clear and create package directory
|
||||
./schema/generate.sh
|
||||
LINUX_PACKAGE_PATH="$(pwd)/$LINUX_PACKAGE_NAME"
|
||||
rm -rf $LINUX_PACKAGE_PATH
|
||||
mkdir $LINUX_PACKAGE_PATH && cd $LINUX_PACKAGE_PATH
|
||||
mkdir MNN_Linux_lib && cd MNN_Linux_lib
|
||||
mkdir Dynamic_Library
|
||||
mkdir Static_Library
|
||||
cd ..
|
||||
mkdir MNN_Linux_tools
|
||||
cd ..
|
||||
|
||||
rm -rf build
|
||||
mkdir build && cd build
|
||||
# tools without dependency, static library without sep_build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_SEP_BUILD=OFF -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_DEMO=ON -DMNN_BUILD_QUANTOOLS=ON -DMNN_EVALUATION=ON .. && make -j$(nproc)
|
||||
pushd ${LINUX_PACKAGE_PATH}
|
||||
cp ../build/*.out MNN_Linux_tools
|
||||
cp ../build/MNNConvert MNN_Linux_tools
|
||||
cp ../build/MNNDump2Json MNN_Linux_tools
|
||||
cp ../build/OnnxClip MNN_Linux_tools
|
||||
cp ../build/libMNN.a MNN_Linux_lib/Static_Library
|
||||
popd
|
||||
|
||||
# dynamic library without sep_build
|
||||
rm CMakeCache.txt
|
||||
cmake -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release .. && make -j$(nproc)
|
||||
cd $LINUX_PACKAGE_PATH
|
||||
cp ../build/libMNN.so MNN_Linux_lib/Dynamic_Library
|
||||
|
||||
# auto zip MNN_Linux_lib MNN_Linux_tools
|
||||
zip -r MNN_Linux_lib.zip MNN_Linux_lib
|
||||
zip -r MNN_Linux_tools.zip MNN_Linux_tools
|
||||
|
|
@ -0,0 +1,71 @@
|
|||
# ./package_scripts/mac/build_lib.sh -o MNN-CPU/lib
|
||||
# ./package_scripts/mac/build_lib.sh -o MNN-CPU-OPENCL/lib -b
|
||||
|
||||
# MNN
|
||||
# |--- Debug
|
||||
# | |--- Dynamic
|
||||
# | |--- Static
|
||||
# |
|
||||
# |--- Release
|
||||
# |--- Dynamic
|
||||
# |--- Static
|
||||
# Only have MNN.framework
|
||||
|
||||
set -e
|
||||
|
||||
usage() {
|
||||
echo "Usage: $0 -o path [-b]"
|
||||
echo -e "\t-o package files output directory"
|
||||
echo -e "\t-b opencl backend"
|
||||
exit 1
|
||||
}
|
||||
|
||||
while getopts "o:hb" opt; do
|
||||
case "$opt" in
|
||||
o ) path=$OPTARG ;;
|
||||
b ) opencl=true ;;
|
||||
h|? ) usage ;;
|
||||
esac
|
||||
done
|
||||
|
||||
# clear and create package directory
|
||||
./schema/generate.sh
|
||||
rm -rf $path && mkdir -p $path
|
||||
pushd $path
|
||||
mkdir -p Debug/Dynamic
|
||||
mkdir -p Debug/Static
|
||||
mkdir -p Release/Dynamic
|
||||
mkdir -p Release/Static
|
||||
popd
|
||||
|
||||
PACKAGE_PATH=$(realpath $path)
|
||||
|
||||
CMAKE_ARGS="-DMNN_SEP_BUILD=OFF -DMNN_AAPL_FMWK=ON"
|
||||
if [ ! -z $opencl ]; then
|
||||
CMAKE_ARGS="$CMAKE_ARGS -DMNN_OPENCL=ON"
|
||||
fi
|
||||
|
||||
rm -rf build && mkdir build
|
||||
pushd build
|
||||
|
||||
# Debug Dynamic MNN.framework
|
||||
[ -f CMakeCache.txt ] && rm CMakeCache.txt
|
||||
cmake $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_BUILD_SHARED_LIBS=ON .. && make -j8
|
||||
cp -R MNN.framework $PACKAGE_PATH/Debug/Dynamic
|
||||
|
||||
# Debug Static MNN.framework
|
||||
[ -f CMakeCache.txt ] && rm CMakeCache.txt
|
||||
cmake $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_BUILD_SHARED_LIBS=OFF .. && make -j8
|
||||
cp -R MNN.framework $PACKAGE_PATH/Debug/Static
|
||||
|
||||
# Release Dynamic MNN.framework
|
||||
[ -f CMakeCache.txt ] && rm CMakeCache.txt
|
||||
cmake $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=ON .. && make -j8
|
||||
cp -R MNN.framework $PACKAGE_PATH/Release/Dynamic
|
||||
|
||||
# Release Static MNN.framework
|
||||
[ -f CMakeCache.txt ] && rm CMakeCache.txt
|
||||
cmake $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF .. && make -j8
|
||||
cp -R MNN.framework $PACKAGE_PATH/Release/Static
|
||||
|
||||
popd
|
||||
|
|
@ -0,0 +1,38 @@
|
|||
# ./package_scripts/mac/build_tools.sh -o MNN-CPU/tools
|
||||
# ./package_scripts/mac/build_tools.sh -o MNN-CPU-OPENCL/tools -b
|
||||
|
||||
set -e
|
||||
|
||||
usage() {
|
||||
echo "Usage: $0 -o path [-b]"
|
||||
echo -e "\t-o package files output directory"
|
||||
echo -e "\t-b opencl backend"
|
||||
exit 1
|
||||
}
|
||||
|
||||
while getopts "o:hb" opt; do
|
||||
case "$opt" in
|
||||
o ) path=$OPTARG ;;
|
||||
b ) opencl=true ;;
|
||||
h|? ) usage ;;
|
||||
esac
|
||||
done
|
||||
|
||||
# clear and create package directory
|
||||
./schema/generate.sh
|
||||
rm -rf $path && mkdir -p $path
|
||||
TOOLS_PATH=$(realpath $path)
|
||||
|
||||
CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Release -DMNN_SEP_BUILD=OFF -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_TRAIN=ON -DMNN_PORTABLE_BUILD=ON -DMNN_BUILD_TOOLS=ON -DMNN_BUILD_QUANTOOLS=ON -DMNN_BUILD_BENCHMARK=ON -DMNN_BUILD_TEST=ON"
|
||||
if [ ! -z $opencl ]; then
|
||||
CMAKE_ARGS="$CMAKE_ARGS -DMNN_OPENCL=ON"
|
||||
fi
|
||||
|
||||
rm -rf build && mkdir build
|
||||
pushd build
|
||||
|
||||
[ -f CMakeCache.txt ] && rm CMakeCache.txt
|
||||
cmake $CMAKE_ARGS .. && make -j8
|
||||
cp *.out $TOOLS_PATH
|
||||
|
||||
popd
|
||||
|
|
@ -0,0 +1,45 @@
|
|||
# ./package_scripts/mac/build_whl.sh -o MNN-CPU/py_whl -v 2.7.17,3.5.7,3.6.9,3.7.4,3.8.0
|
||||
# ./package_scripts/mac/build_whl.sh -o MNN-CPU-OPENCL/py_whl -v 2.7.17,3.5.7,3.6.9,3.7.4,3.8.0 -b
|
||||
|
||||
set -e
|
||||
|
||||
usage() {
|
||||
echo "Usage: $0 -o path -v python_versions [-b]"
|
||||
echo -e "\t-o package files output directory"
|
||||
echo -e "\t-v python versions in pyenv"
|
||||
echo -e "\t-b opencl backend"
|
||||
exit 1
|
||||
}
|
||||
|
||||
while getopts "o:v:hb" opt; do
|
||||
case "$opt" in
|
||||
o ) path=$OPTARG ;;
|
||||
v ) IFS="," read -a python_versions <<< $OPTARG ;;
|
||||
b ) opencl=true ;;
|
||||
h|? ) usage ;;
|
||||
esac
|
||||
done
|
||||
|
||||
./schema/generate.sh
|
||||
rm -rf $path && mkdir -p $path
|
||||
PACKAGE_PATH=$(realpath $path)
|
||||
|
||||
CMAKE_ARGS="-DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF -DMNN_EXPR_SHAPE_EAGER=ON -DMNN_TRAIN_DEBUG=ON"
|
||||
if [ ! -z $opencl ]; then
|
||||
CMAKE_ARGS="$CMAKE_ARGS -DMNN_OPENCL=ON"
|
||||
fi
|
||||
|
||||
rm -rf pymnn_build && mkdir pymnn_build
|
||||
pushd pymnn_build
|
||||
cmake $CMAKE_ARGS .. && make MNN MNNTrain MNNConvert -j8
|
||||
popd
|
||||
|
||||
pushd pymnn/pip_package
|
||||
rm -rf dist && mkdir dist
|
||||
for env in $python_versions; do
|
||||
pyenv global $env
|
||||
python build_wheel.py
|
||||
done
|
||||
cp dist/* $PACKAGE_PATH
|
||||
|
||||
popd
|
||||
|
|
@ -1,49 +0,0 @@
|
|||
# MNN_Mac
|
||||
# |------- MNN_Mac_lib
|
||||
# |---------- Dynamic_Library
|
||||
# |---------- Static_Library
|
||||
# |---------- MNN.framework
|
||||
# |------- MNN_Mac_tools
|
||||
|
||||
MAC_PACKAGE_NAME="MNN_Mac"
|
||||
|
||||
# clear and create package directory
|
||||
./schema/generate.sh
|
||||
MAC_PACKAGE_PATH=$(pwd)/$MAC_PACKAGE_NAME
|
||||
rm -rf $MAC_PACKAGE_PATH
|
||||
mkdir $MAC_PACKAGE_PATH && cd $MAC_PACKAGE_PATH
|
||||
mkdir MNN_Mac_lib && cd MNN_Mac_lib
|
||||
mkdir Dynamic_Library
|
||||
mkdir Static_Library
|
||||
cd ..
|
||||
mkdir MNN_Mac_tools
|
||||
cd ..
|
||||
|
||||
rm -rf build
|
||||
mkdir build && cd build
|
||||
# tools without dependency, static library without sep_build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_SEP_BUILD=OFF -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_DEMO=ON -DMNN_BUILD_QUANTOOLS=ON -DMNN_EVALUATION=ON .. && make -j8
|
||||
pushd ${MAC_PACKAGE_PATH}
|
||||
cp ../build/*.out MNN_Mac_tools
|
||||
cp ../build/MNNConvert MNN_Mac_tools
|
||||
cp ../build/MNNDump2Json MNN_Mac_tools
|
||||
cp ../build/OnnxClip MNN_Mac_tools
|
||||
cp ../build/libMNN.a MNN_Mac_lib/Static_Library
|
||||
popd
|
||||
|
||||
# dynamic library without sep_build
|
||||
rm CMakeCache.txt
|
||||
cmake -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release .. && make -j8
|
||||
cd ..
|
||||
cp build/libMNN.dylib ${MAC_PACKAGE_PATH}/MNN_Mac_lib/Dynamic_Library
|
||||
|
||||
# mac framework without sep_build
|
||||
cd build
|
||||
rm CMakeCache.txt
|
||||
cmake -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_AAPL_FMWK=ON .. && make -j8
|
||||
cd $MAC_PACKAGE_PATH
|
||||
cp -r ../build/MNN.framework MNN_Mac_lib
|
||||
|
||||
# auto zip MNN_Mac_lib MNN_Mac_tools
|
||||
zip -r MNN_Mac_lib.zip MNN_Mac_lib
|
||||
zip -r MNN_Mac_tools.zip MNN_Mac_tools
|
||||
|
|
@ -0,0 +1,89 @@
|
|||
# .\package_scripts\win\build_lib.ps1 -path MNN-CPU/lib/x64
|
||||
# .\package_scripts\win\build_lib.ps1 -path MNN-CPU/lib/x86
|
||||
# .\package_scripts\win\build_lib.ps1 -path MNN-CPU-OPENCL/lib/x64 -opencl
|
||||
# .\package_scripts\win\build_lib.ps1 -path MNN-CPU-OPENCL/lib/x86 -opencl
|
||||
|
||||
# MNN
|
||||
# |-- Debug
|
||||
# | |--- MD
|
||||
# | |--- MT
|
||||
# | |--- Static
|
||||
# |
|
||||
# |-- Release
|
||||
# |--- MD
|
||||
# |--- MT
|
||||
# |--- Static
|
||||
|
||||
Param(
|
||||
[Parameter(Mandatory=$true)][String]$path,
|
||||
[Switch]$opencl
|
||||
)
|
||||
$erroractionpreference = "stop"
|
||||
Remove-Item $path -Recurse -ErrorAction Ignore
|
||||
mkdir -p $path
|
||||
$PACKAGE_PATH = $(Resolve-Path $path).Path
|
||||
|
||||
#clear and create package directory
|
||||
powershell ./schema/generate.ps1
|
||||
pushd $PACKAGE_PATH
|
||||
mkdir -p Debug\MD
|
||||
mkdir -p Debug\MT
|
||||
mkdir -p Debug\Static
|
||||
mkdir -p Release\MD
|
||||
mkdir -p Release\MT
|
||||
mkdir -p Release\Static
|
||||
popd
|
||||
|
||||
$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF"
|
||||
if ($opencl) {
|
||||
$CMAKE_ARGS = "$CMAKE_ARGS -DMNN_OPENCL=ON"
|
||||
}
|
||||
|
||||
Remove-Item build -Recurse -ErrorAction Ignore
|
||||
mkdir build
|
||||
pushd build
|
||||
|
||||
Remove-Item CMakeCache.txt -ErrorAction Ignore
|
||||
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON .."
|
||||
ninja
|
||||
cp MNN.lib $PACKAGE_PATH\Debug\MT
|
||||
cp MNN.dll $PACKAGE_PATH\Debug\MT
|
||||
cp MNN.pdb $PACKAGE_PATH\Debug\MT
|
||||
rm MNN.*
|
||||
|
||||
Remove-Item CMakeCache.txt -ErrorAction Ignore
|
||||
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF .."
|
||||
ninja
|
||||
cp MNN.lib $PACKAGE_PATH\Debug\MD
|
||||
cp MNN.dll $PACKAGE_PATH\Debug\MD
|
||||
cp MNN.pdb $PACKAGE_PATH\Debug\MD
|
||||
rm MNN.*
|
||||
|
||||
Remove-Item CMakeCache.txt -ErrorAction Ignore
|
||||
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF .."
|
||||
ninja
|
||||
cp MNN.lib $PACKAGE_PATH\Debug\Static
|
||||
rm MNN.*
|
||||
|
||||
Remove-Item CMakeCache.txt -ErrorAction Ignore
|
||||
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON .."
|
||||
ninja
|
||||
cp MNN.lib $PACKAGE_PATH\Release\MT
|
||||
cp MNN.dll $PACKAGE_PATH\Release\MT
|
||||
cp MNN.pdb $PACKAGE_PATH\Release\MT
|
||||
rm MNN.*
|
||||
|
||||
Remove-Item CMakeCache.txt -ErrorAction Ignore
|
||||
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF .."
|
||||
ninja
|
||||
cp MNN.lib $PACKAGE_PATH\Release\MD
|
||||
cp MNN.dll $PACKAGE_PATH\Release\MD
|
||||
cp MNN.pdb $PACKAGE_PATH\Release\MD
|
||||
rm MNN.*
|
||||
|
||||
Remove-Item CMakeCache.txt -ErrorAction Ignore
|
||||
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF .."
|
||||
ninja
|
||||
cp MNN.lib $PACKAGE_PATH\Release\Static
|
||||
|
||||
popd
|
||||
|
|
@ -0,0 +1,39 @@
|
|||
# .\package_scripts\win\build_tools.ps1 -path MNN-CPU/tools/x64
|
||||
# .\package_scripts\win\build_tools.ps1 -path MNN-CPU/tools/x86
|
||||
# .\package_scripts\win\build_tools.ps1 -path MNN-CPU-OPENCL/tools/x64 -opencl
|
||||
# .\package_scripts\win\build_tools.ps1 -path MNN-CPU-OPENCL/tools/x86 -opencl
|
||||
|
||||
Param(
|
||||
[Parameter(Mandatory=$true)][String]$path,
|
||||
[Switch]$opencl
|
||||
)
|
||||
$erroractionpreference = "stop"
|
||||
Remove-Item $path -Recurse -ErrorAction Ignore
|
||||
mkdir -p $path
|
||||
$TOOLS_PATH = $(Resolve-Path $path).Path
|
||||
|
||||
powershell ./schema/generate.ps1
|
||||
|
||||
$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_TOOLS=ON -DMNN_BUILD_QUANTOOLS=ON -DMNN_EVALUATION=ON -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_BENCHMARK=ON -DMNN_BUILD_TEST=ON"
|
||||
if ($opencl) {
|
||||
$CMAKE_ARGS = "$CMAKE_ARGS -DMNN_OPENCL=ON"
|
||||
}
|
||||
|
||||
Remove-Item build -Recurse -ErrorAction Ignore
|
||||
mkdir build
|
||||
pushd build
|
||||
|
||||
Remove-Item CMakeCache.txt -ErrorAction Ignore
|
||||
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_SHARED_LIBS=OFF .."
|
||||
ninja
|
||||
cp MNNV2Basic.out.exe $TOOLS_PATH
|
||||
cp MNNConvert.exe $TOOLS_PATH
|
||||
cp testModel.out.exe $TOOLS_PATH
|
||||
cp run_test.out.exe $TOOLS_PATH
|
||||
cp quantized.out.exe $TOOLS_PATH
|
||||
cp train.out.exe $TOOLS_PATH
|
||||
cp benchmark.out.exe $TOOLS_PATH
|
||||
cp benchmarkExprModels.out.exe $TOOLS_PATH
|
||||
cp backendTest.out.exe $TOOLS_PATH
|
||||
|
||||
popd
|
||||
|
|
@ -0,0 +1,46 @@
|
|||
# .\package_scripts\win_pymm_package.ps1 -path MNN-CPU/py_whl/x64 -pyenvs "2.7.17,3.5.4,2.6.8,3.7.7,3.8.2"
|
||||
# .\package_scripts\win_pymm_package.ps1 -x86 -path MNN-CPU/py_whl/x86 -pyenvs "2.7.17-win32,3.5.4-win32,2.6.8-win32,3.7.7-win32,3.8.2-win32"
|
||||
# .\package_scripts\win_pymm_package.ps1 -path MNN-CPU-OPENCL/py_whl/x64 -pyenvs "2.7.17,3.5.4,2.6.8,3.7.7,3.8.2"
|
||||
# .\package_scripts\win_pymm_package.ps1 -x86 -path MNN-CPU-OPENCL/py_whl/x86 -pyenvs "2.7.17-win32,3.5.4-win32,2.6.8-win32,3.7.7-win32,3.8.2-win32"
|
||||
Param(
|
||||
[Parameter(Mandatory=$true)][String]$pyenvs,
|
||||
[Parameter(Mandatory=$true)][String]$path,
|
||||
[Switch]$x86,
|
||||
[Switch]$opencl
|
||||
)
|
||||
|
||||
$erroractionpreference = "stop"
|
||||
$python_versions = $pyenvs.Split(",")
|
||||
|
||||
Remove-Item $path -Recurse -ErrorAction Ignore
|
||||
mkdir -p $path
|
||||
$PACKAGE_PATH = $(Resolve-Path $path).Path
|
||||
$ARGS = ""
|
||||
if ($x86) {
|
||||
$ARGS = "--x86"
|
||||
}
|
||||
|
||||
powershell ./schema/generate.ps1
|
||||
|
||||
Remove-Item pymnn_build -Recurse -ErrorAction Ignore
|
||||
mkdir pymnn_build
|
||||
|
||||
pushd pymnn/pip_package
|
||||
Remove-Item dist -Recurse -ErrorAction Ignore
|
||||
mkdir dist
|
||||
|
||||
pushd pymnn_build
|
||||
$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON "
|
||||
if ($opencl) {
|
||||
$CMAKE_ARGS = "$CMAKE_ARGS -DMNN_OPENCL=ON"
|
||||
}
|
||||
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS .."
|
||||
ninja MNN MNNTrain MNNConvert
|
||||
popd
|
||||
|
||||
Foreach ($env in $python_versions) {
|
||||
pyenv global $env
|
||||
python build_wheel.py $ARGS
|
||||
}
|
||||
cp dist/* $PACKAGE_PATH
|
||||
popd
|
||||
|
|
@ -1,65 +0,0 @@
|
|||
# MNN
|
||||
# |-- Debug
|
||||
# | |--- MD
|
||||
# | |--- MT
|
||||
# |-- Release
|
||||
# |--- MD
|
||||
# |--- MT
|
||||
|
||||
$erroractionpreference = "stop"
|
||||
|
||||
Set-Variable -Name WINDOWS_PACKAGE_NAME -Value "MNN"
|
||||
|
||||
#clear and create package directory
|
||||
powershell ./schema/generate.ps1
|
||||
Set-Variable -Name WINDOWS_PACKAGE_PATH -Value "$(pwd)\$WINDOWS_PACKAGE_NAME"
|
||||
Remove-Item $WINDOWS_PACKAGE_PATH -Recurse -ErrorAction Ignore
|
||||
mkdir $WINDOWS_PACKAGE_PATH\
|
||||
cd $WINDOWS_PACKAGE_PATH
|
||||
mkdir -p Debug\MD
|
||||
mkdir -p Debug\MT
|
||||
mkdir -p Release\MD
|
||||
mkdir -p Release\MT
|
||||
cd ..
|
||||
|
||||
Remove-Item build -Recurse -ErrorAction Ignore
|
||||
mkdir build
|
||||
pushd build
|
||||
# tools without dependency, static library without sep_build
|
||||
#cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_BUILD_CONVERTER=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_DEMO=ON -DMNN_BUILD_QUANTOOLS=ON -DMNN_EVALUATION=ON ..
|
||||
#ninja
|
||||
#pushd $WINDOWS_PACKAGE_PATH
|
||||
#cp ..\build\*.exe MNN_Windows_tools
|
||||
#cp ..\build\*.pdb MNN_Windows_tools
|
||||
#cp ..\build\MNN.lib MNN_Windows_lib\Static_Library
|
||||
#popd
|
||||
|
||||
Remove-Item CMakeCache.txt -ErrorAction Ignore
|
||||
cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON -DMNN_OPENCL=ON ..
|
||||
ninja
|
||||
cp MNN.lib $WINDOWS_PACKAGE_PATH\Debug\MT
|
||||
cp MNN.dll $WINDOWS_PACKAGE_PATH\Debug\MT
|
||||
cp MNN.pdb $WINDOWS_PACKAGE_PATH\Debug\MT
|
||||
|
||||
Remove-Item CMakeCache.txt -ErrorAction Ignore
|
||||
cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF -DMNN_OPENCL=ON ..
|
||||
ninja
|
||||
cp MNN.lib $WINDOWS_PACKAGE_PATH\Debug\MD
|
||||
cp MNN.dll $WINDOWS_PACKAGE_PATH\Debug\MD
|
||||
cp MNN.pdb $WINDOWS_PACKAGE_PATH\Debug\MD
|
||||
|
||||
Remove-Item CMakeCache.txt -ErrorAction Ignore
|
||||
cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON -DMNN_OPENCL=ON ..
|
||||
ninja
|
||||
cp MNN.lib $WINDOWS_PACKAGE_PATH\Release\MT
|
||||
cp MNN.dll $WINDOWS_PACKAGE_PATH\Release\MT
|
||||
cp MNN.pdb $WINDOWS_PACKAGE_PATH\Release\MT
|
||||
|
||||
Remove-Item CMakeCache.txt -ErrorAction Ignore
|
||||
cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF -DMNN_OPENCL=ON ..
|
||||
ninja
|
||||
cp MNN.lib $WINDOWS_PACKAGE_PATH\Release\MD
|
||||
cp MNN.dll $WINDOWS_PACKAGE_PATH\Release\MD
|
||||
cp MNN.pdb $WINDOWS_PACKAGE_PATH\Release\MD
|
||||
|
||||
popd
|
||||
|
|
@ -118,7 +118,6 @@
|
|||
489404DE24A2FC2C001E456C /* GeometryReverseSequence.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 489404DD24A2FC2B001E456C /* GeometryReverseSequence.cpp */; };
|
||||
489D7A672550FDC800AD896A /* MetalReLU6.metal in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A162550FDC800AD896A /* MetalReLU6.metal */; };
|
||||
489D7A682550FDC800AD896A /* MetalReduction.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A172550FDC800AD896A /* MetalReduction.hpp */; };
|
||||
489D7A692550FDC800AD896A /* MetalDequantize.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A182550FDC800AD896A /* MetalDequantize.hpp */; };
|
||||
489D7A6A2550FDC800AD896A /* MetalConvolutionGEMM.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A192550FDC800AD896A /* MetalConvolutionGEMM.hpp */; };
|
||||
489D7A6B2550FDC800AD896A /* MetalReLU.metal in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A1A2550FDC800AD896A /* MetalReLU.metal */; };
|
||||
489D7A6E2550FDC800AD896A /* MetalROIPooling.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A1D2550FDC800AD896A /* MetalROIPooling.hpp */; };
|
||||
|
|
@ -156,9 +155,7 @@
|
|||
489D7A8F2550FDC900AD896A /* MetalScale.metal in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A3E2550FDC800AD896A /* MetalScale.metal */; };
|
||||
489D7A902550FDC900AD896A /* MetalConvolution.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A3F2550FDC800AD896A /* MetalConvolution.hpp */; };
|
||||
489D7A912550FDC900AD896A /* MetalScale.mm in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A402550FDC800AD896A /* MetalScale.mm */; };
|
||||
489D7A922550FDC900AD896A /* MetalDequantize.metal in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A412550FDC800AD896A /* MetalDequantize.metal */; };
|
||||
489D7A932550FDC900AD896A /* MetalFixedPoint.metal in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A422550FDC800AD896A /* MetalFixedPoint.metal */; };
|
||||
489D7A942550FDC900AD896A /* MetalDequantize.mm in Sources */ = {isa = PBXBuildFile; fileRef = 489D7A432550FDC800AD896A /* MetalDequantize.mm */; };
|
||||
489D7A952550FDC900AD896A /* MetalMatMul.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A442550FDC800AD896A /* MetalMatMul.hpp */; };
|
||||
489D7A962550FDC900AD896A /* MetalConvolution1x1.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A452550FDC800AD896A /* MetalConvolution1x1.hpp */; };
|
||||
489D7A972550FDC900AD896A /* MetalConvolutionDepthwise.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A462550FDC800AD896A /* MetalConvolutionDepthwise.hpp */; };
|
||||
|
|
@ -299,7 +296,6 @@
|
|||
92FF025523AA0B5A00AC97F6 /* CPUTanh.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF00D323AA0B4800AC97F6 /* CPUTanh.cpp */; };
|
||||
92FF025723AA0B5A00AC97F6 /* CPUQuanConvolutionDepthwise.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF00D523AA0B4800AC97F6 /* CPUQuanConvolutionDepthwise.cpp */; };
|
||||
92FF025923AA0B5A00AC97F6 /* CPUPoolInt8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF00D723AA0B4800AC97F6 /* CPUPoolInt8.cpp */; };
|
||||
92FF025B23AA0B5A00AC97F6 /* CPUPoolGrad.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF00D923AA0B4800AC97F6 /* CPUPoolGrad.hpp */; };
|
||||
92FF025C23AA0B5A00AC97F6 /* CPUGatherV2.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF00DA23AA0B4800AC97F6 /* CPUGatherV2.hpp */; };
|
||||
92FF025D23AA0B5A00AC97F6 /* CPUInterp.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF00DB23AA0B4800AC97F6 /* CPUInterp.hpp */; };
|
||||
92FF025E23AA0B5A00AC97F6 /* CPUROIPooling.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF00DC23AA0B4900AC97F6 /* CPUROIPooling.cpp */; };
|
||||
|
|
@ -516,7 +512,6 @@
|
|||
92FF037A23AA0B5A00AC97F6 /* CPUSigmoid.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01FB23AA0B5200AC97F6 /* CPUSigmoid.cpp */; };
|
||||
92FF037D23AA0B5A00AC97F6 /* CPURelu.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01FE23AA0B5200AC97F6 /* CPURelu.cpp */; };
|
||||
92FF037E23AA0B5A00AC97F6 /* CPUDetectionPostProcess.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01FF23AA0B5200AC97F6 /* CPUDetectionPostProcess.cpp */; };
|
||||
92FF038023AA0B5A00AC97F6 /* CPUPoolGrad.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF020123AA0B5300AC97F6 /* CPUPoolGrad.cpp */; };
|
||||
92FF038223AA0B5A00AC97F6 /* CPUSetDiff1D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF020323AA0B5300AC97F6 /* CPUSetDiff1D.cpp */; };
|
||||
92FF038523AA0B5A00AC97F6 /* CPUMoments.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF020623AA0B5300AC97F6 /* CPUMoments.cpp */; };
|
||||
92FF038623AA0B5A00AC97F6 /* CPULinSpace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF020723AA0B5300AC97F6 /* CPULinSpace.cpp */; };
|
||||
|
|
@ -678,8 +673,6 @@
|
|||
C43C81E02518944F00A0FF84 /* WinogradHelper.hpp in Headers */ = {isa = PBXBuildFile; fileRef = C43C81D82518944F00A0FF84 /* WinogradHelper.hpp */; };
|
||||
C43C81E12518944F00A0FF84 /* ConvInt8_1xN.hpp in Headers */ = {isa = PBXBuildFile; fileRef = C43C81D92518944F00A0FF84 /* ConvInt8_1xN.hpp */; };
|
||||
C43C81E22518944F00A0FF84 /* ConvInt8_1xN.cpp in Sources */ = {isa = PBXBuildFile; fileRef = C43C81DA2518944F00A0FF84 /* ConvInt8_1xN.cpp */; };
|
||||
C43C81E32518944F00A0FF84 /* BlstmComputer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = C43C81DB2518944F00A0FF84 /* BlstmComputer.cpp */; };
|
||||
C43C81E42518944F00A0FF84 /* BlstmComputer.hpp in Headers */ = {isa = PBXBuildFile; fileRef = C43C81DC2518944F00A0FF84 /* BlstmComputer.hpp */; };
|
||||
C43C81EE2518947700A0FF84 /* MNNGemmInt8toFloat32_8x4_Common.S in Sources */ = {isa = PBXBuildFile; fileRef = C43C81EB2518947700A0FF84 /* MNNGemmInt8toFloat32_8x4_Common.S */; };
|
||||
C43C81F32518948800A0FF84 /* MNNGemmInt8toFloat32_8x4_Common.S in Sources */ = {isa = PBXBuildFile; fileRef = C43C81EF2518948800A0FF84 /* MNNGemmInt8toFloat32_8x4_Common.S */; };
|
||||
C43C81F42518948800A0FF84 /* MNNGemmint8to32_8x4_Common.S in Sources */ = {isa = PBXBuildFile; fileRef = C43C81F02518948800A0FF84 /* MNNGemmint8to32_8x4_Common.S */; };
|
||||
|
|
@ -847,7 +840,6 @@
|
|||
489404DD24A2FC2B001E456C /* GeometryReverseSequence.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryReverseSequence.cpp; sourceTree = "<group>"; };
|
||||
489D7A162550FDC800AD896A /* MetalReLU6.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = MetalReLU6.metal; sourceTree = "<group>"; };
|
||||
489D7A172550FDC800AD896A /* MetalReduction.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalReduction.hpp; sourceTree = "<group>"; };
|
||||
489D7A182550FDC800AD896A /* MetalDequantize.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalDequantize.hpp; sourceTree = "<group>"; };
|
||||
489D7A192550FDC800AD896A /* MetalConvolutionGEMM.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalConvolutionGEMM.hpp; sourceTree = "<group>"; };
|
||||
489D7A1A2550FDC800AD896A /* MetalReLU.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = MetalReLU.metal; sourceTree = "<group>"; };
|
||||
489D7A1D2550FDC800AD896A /* MetalROIPooling.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalROIPooling.hpp; sourceTree = "<group>"; };
|
||||
|
|
@ -885,9 +877,7 @@
|
|||
489D7A3E2550FDC800AD896A /* MetalScale.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = MetalScale.metal; sourceTree = "<group>"; };
|
||||
489D7A3F2550FDC800AD896A /* MetalConvolution.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalConvolution.hpp; sourceTree = "<group>"; };
|
||||
489D7A402550FDC800AD896A /* MetalScale.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalScale.mm; sourceTree = "<group>"; };
|
||||
489D7A412550FDC800AD896A /* MetalDequantize.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = MetalDequantize.metal; sourceTree = "<group>"; };
|
||||
489D7A422550FDC800AD896A /* MetalFixedPoint.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = MetalFixedPoint.metal; sourceTree = "<group>"; };
|
||||
489D7A432550FDC800AD896A /* MetalDequantize.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalDequantize.mm; sourceTree = "<group>"; };
|
||||
489D7A442550FDC800AD896A /* MetalMatMul.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalMatMul.hpp; sourceTree = "<group>"; };
|
||||
489D7A452550FDC800AD896A /* MetalConvolution1x1.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalConvolution1x1.hpp; sourceTree = "<group>"; };
|
||||
489D7A462550FDC800AD896A /* MetalConvolutionDepthwise.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalConvolutionDepthwise.hpp; sourceTree = "<group>"; };
|
||||
|
|
@ -1035,7 +1025,6 @@
|
|||
92FF00D323AA0B4800AC97F6 /* CPUTanh.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUTanh.cpp; sourceTree = "<group>"; };
|
||||
92FF00D523AA0B4800AC97F6 /* CPUQuanConvolutionDepthwise.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUQuanConvolutionDepthwise.cpp; sourceTree = "<group>"; };
|
||||
92FF00D723AA0B4800AC97F6 /* CPUPoolInt8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUPoolInt8.cpp; sourceTree = "<group>"; };
|
||||
92FF00D923AA0B4800AC97F6 /* CPUPoolGrad.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUPoolGrad.hpp; sourceTree = "<group>"; };
|
||||
92FF00DA23AA0B4800AC97F6 /* CPUGatherV2.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUGatherV2.hpp; sourceTree = "<group>"; };
|
||||
92FF00DB23AA0B4800AC97F6 /* CPUInterp.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUInterp.hpp; sourceTree = "<group>"; };
|
||||
92FF00DC23AA0B4900AC97F6 /* CPUROIPooling.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUROIPooling.cpp; sourceTree = "<group>"; };
|
||||
|
|
@ -1252,7 +1241,6 @@
|
|||
92FF01FB23AA0B5200AC97F6 /* CPUSigmoid.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUSigmoid.cpp; sourceTree = "<group>"; };
|
||||
92FF01FE23AA0B5200AC97F6 /* CPURelu.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPURelu.cpp; sourceTree = "<group>"; };
|
||||
92FF01FF23AA0B5200AC97F6 /* CPUDetectionPostProcess.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUDetectionPostProcess.cpp; sourceTree = "<group>"; };
|
||||
92FF020123AA0B5300AC97F6 /* CPUPoolGrad.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUPoolGrad.cpp; sourceTree = "<group>"; };
|
||||
92FF020323AA0B5300AC97F6 /* CPUSetDiff1D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUSetDiff1D.cpp; sourceTree = "<group>"; };
|
||||
92FF020623AA0B5300AC97F6 /* CPUMoments.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUMoments.cpp; sourceTree = "<group>"; };
|
||||
92FF020723AA0B5300AC97F6 /* CPULinSpace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPULinSpace.cpp; sourceTree = "<group>"; };
|
||||
|
|
@ -1414,8 +1402,6 @@
|
|||
C43C81D82518944F00A0FF84 /* WinogradHelper.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = WinogradHelper.hpp; sourceTree = "<group>"; };
|
||||
C43C81D92518944F00A0FF84 /* ConvInt8_1xN.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvInt8_1xN.hpp; sourceTree = "<group>"; };
|
||||
C43C81DA2518944F00A0FF84 /* ConvInt8_1xN.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvInt8_1xN.cpp; sourceTree = "<group>"; };
|
||||
C43C81DB2518944F00A0FF84 /* BlstmComputer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = BlstmComputer.cpp; sourceTree = "<group>"; };
|
||||
C43C81DC2518944F00A0FF84 /* BlstmComputer.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = BlstmComputer.hpp; sourceTree = "<group>"; };
|
||||
C43C81EB2518947700A0FF84 /* MNNGemmInt8toFloat32_8x4_Common.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmInt8toFloat32_8x4_Common.S; sourceTree = "<group>"; };
|
||||
C43C81EF2518948800A0FF84 /* MNNGemmInt8toFloat32_8x4_Common.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmInt8toFloat32_8x4_Common.S; sourceTree = "<group>"; };
|
||||
C43C81F02518948800A0FF84 /* MNNGemmint8to32_8x4_Common.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmint8to32_8x4_Common.S; sourceTree = "<group>"; };
|
||||
|
|
@ -1796,8 +1782,6 @@
|
|||
92FF025023AA0B5900AC97F6 /* CPUOPRegister.cpp */,
|
||||
92FF01F123AA0B5200AC97F6 /* CPUPool.cpp */,
|
||||
92FF00F823AA0B4A00AC97F6 /* CPUPool.hpp */,
|
||||
92FF020123AA0B5300AC97F6 /* CPUPoolGrad.cpp */,
|
||||
92FF00D923AA0B4800AC97F6 /* CPUPoolGrad.hpp */,
|
||||
92FF00D723AA0B4800AC97F6 /* CPUPoolInt8.cpp */,
|
||||
92FF00F123AA0B4A00AC97F6 /* CPUPoolInt8.hpp */,
|
||||
92FF010223AA0B4B00AC97F6 /* CPUPriorbox.cpp */,
|
||||
|
|
@ -1898,7 +1882,6 @@
|
|||
children = (
|
||||
489D7A162550FDC800AD896A /* MetalReLU6.metal */,
|
||||
489D7A172550FDC800AD896A /* MetalReduction.hpp */,
|
||||
489D7A182550FDC800AD896A /* MetalDequantize.hpp */,
|
||||
489D7A192550FDC800AD896A /* MetalConvolutionGEMM.hpp */,
|
||||
489D7A1A2550FDC800AD896A /* MetalReLU.metal */,
|
||||
489D7A1D2550FDC800AD896A /* MetalROIPooling.hpp */,
|
||||
|
|
@ -1936,9 +1919,7 @@
|
|||
489D7A3E2550FDC800AD896A /* MetalScale.metal */,
|
||||
489D7A3F2550FDC800AD896A /* MetalConvolution.hpp */,
|
||||
489D7A402550FDC800AD896A /* MetalScale.mm */,
|
||||
489D7A412550FDC800AD896A /* MetalDequantize.metal */,
|
||||
489D7A422550FDC800AD896A /* MetalFixedPoint.metal */,
|
||||
489D7A432550FDC800AD896A /* MetalDequantize.mm */,
|
||||
489D7A442550FDC800AD896A /* MetalMatMul.hpp */,
|
||||
489D7A452550FDC800AD896A /* MetalConvolution1x1.hpp */,
|
||||
489D7A462550FDC800AD896A /* MetalConvolutionDepthwise.hpp */,
|
||||
|
|
@ -2324,8 +2305,6 @@
|
|||
92FF021B23AA0B5600AC97F6 /* compute */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
C43C81DB2518944F00A0FF84 /* BlstmComputer.cpp */,
|
||||
C43C81DC2518944F00A0FF84 /* BlstmComputer.hpp */,
|
||||
C43C81DA2518944F00A0FF84 /* ConvInt8_1xN.cpp */,
|
||||
C43C81D92518944F00A0FF84 /* ConvInt8_1xN.hpp */,
|
||||
C43C81D62518944F00A0FF84 /* ConvInt83x3.cpp */,
|
||||
|
|
@ -2567,12 +2546,10 @@
|
|||
92FF035823AA0B5A00AC97F6 /* CPUTFQuantizedConv2D.hpp in Headers */,
|
||||
92FF027223AA0B5A00AC97F6 /* CPUScatterNd.hpp in Headers */,
|
||||
489D7A902550FDC900AD896A /* MetalConvolution.hpp in Headers */,
|
||||
92FF025B23AA0B5A00AC97F6 /* CPUPoolGrad.hpp in Headers */,
|
||||
92FF03A923AA0B5A00AC97F6 /* ConvolutionGroup.hpp in Headers */,
|
||||
92FF03BD23AA0B5A00AC97F6 /* Int8FunctionsOpt.h in Headers */,
|
||||
488F1159247BB2A0008E85C6 /* Arm82Raster.hpp in Headers */,
|
||||
92FF036623AA0B5A00AC97F6 /* CPUDetectionOutput.hpp in Headers */,
|
||||
489D7A692550FDC800AD896A /* MetalDequantize.hpp in Headers */,
|
||||
92FF04BC23AA0BFB00AC97F6 /* NonCopyable.hpp in Headers */,
|
||||
48FA474B23AA127B00172C3B /* Utils.hpp in Headers */,
|
||||
C43C82302518951800A0FF84 /* ImageFloatBlitter.hpp in Headers */,
|
||||
|
|
@ -2584,7 +2561,6 @@
|
|||
92FF03AF23AA0B5A00AC97F6 /* WinogradOptFunction.hpp in Headers */,
|
||||
92FF03C923AA0B5A00AC97F6 /* CPUMatMul.hpp in Headers */,
|
||||
EBECA39924643D320062C7A3 /* Arm82Relu.hpp in Headers */,
|
||||
C43C81E42518944F00A0FF84 /* BlstmComputer.hpp in Headers */,
|
||||
92FF03B223AA0B5A00AC97F6 /* ConvolutionInt8Executor.hpp in Headers */,
|
||||
92FF03A523AA0B5A00AC97F6 /* DeconvolutionWithStride.hpp in Headers */,
|
||||
489D7A7F2550FDC900AD896A /* MetalReLU.hpp in Headers */,
|
||||
|
|
@ -2839,7 +2815,6 @@
|
|||
92FF034023AA0B5A00AC97F6 /* CPUShape.cpp in Sources */,
|
||||
92FF02B023AA0B5A00AC97F6 /* CPUDequantize.cpp in Sources */,
|
||||
92FF04C223AA0BFB00AC97F6 /* Pipeline.cpp in Sources */,
|
||||
C43C81E32518944F00A0FF84 /* BlstmComputer.cpp in Sources */,
|
||||
92FF04C423AA0BFB00AC97F6 /* Session.cpp in Sources */,
|
||||
48A8A61321D101A700C2B9A7 /* ImageSampler.cpp in Sources */,
|
||||
92FF02D123AA0B5A00AC97F6 /* MNNMaxFloat.S in Sources */,
|
||||
|
|
@ -2874,7 +2849,6 @@
|
|||
489D7AA42550FDC900AD896A /* MetalROIPooling.mm in Sources */,
|
||||
92FF03B423AA0B5A00AC97F6 /* Convolution1x1Strassen.cpp in Sources */,
|
||||
489D7A772550FDC800AD896A /* MetalConvolutionGEMM.mm in Sources */,
|
||||
489D7A942550FDC900AD896A /* MetalDequantize.mm in Sources */,
|
||||
92FF031623AA0B5A00AC97F6 /* MNNMatrixMax.S in Sources */,
|
||||
92FF043A23AA0B7100AC97F6 /* ShapePermute.cpp in Sources */,
|
||||
489D7A8E2550FDC900AD896A /* MetalPooling.mm in Sources */,
|
||||
|
|
@ -2983,7 +2957,6 @@
|
|||
48747D64245D9E33000B9709 /* GeometryTile.cpp in Sources */,
|
||||
92FF043723AA0B7100AC97F6 /* ShapeDetectionOutput.cpp in Sources */,
|
||||
92FF042623AA0B7100AC97F6 /* ShapeCosineSimilarity.cpp in Sources */,
|
||||
489D7A922550FDC900AD896A /* MetalDequantize.metal in Sources */,
|
||||
92FF02DC23AA0B5A00AC97F6 /* MNNReluInt8.S in Sources */,
|
||||
92FF041A23AA0B7100AC97F6 /* ShapeFill.cpp in Sources */,
|
||||
EB45C776244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */,
|
||||
|
|
@ -2994,7 +2967,6 @@
|
|||
92FF03A123AA0B5A00AC97F6 /* Int8FunctionsOpt.cpp in Sources */,
|
||||
92FF026523AA0B5A00AC97F6 /* CPUQuantizedAvgPool.cpp in Sources */,
|
||||
92FF029423AA0B5A00AC97F6 /* CPUMatMul.cpp in Sources */,
|
||||
92FF038023AA0B5A00AC97F6 /* CPUPoolGrad.cpp in Sources */,
|
||||
48747D62245D9E33000B9709 /* GeometryOPRegister.cpp in Sources */,
|
||||
92FF03A323AA0B5A00AC97F6 /* ConvOpt.cpp in Sources */,
|
||||
92FF02CD23AA0B5A00AC97F6 /* MNNNV21ToRGBUnit.S in Sources */,
|
||||
|

@@ -15,6 +15,11 @@ def usage():
print(" [--MNNModel MNNMODEL]")
print(" [--fp16 {True,False}]")
print(" [--weightQuantBits {num of bits for weight-only-quant, default:0, which means no quant}]")
print(" [--weightQuantAsymmetric {True,False use asymmetric quant method for weight-only-quant, \
the default method is symmetric quant, which is compatible with old MNN versions. \
you can set this flag to True use asymmetric quant method to improve accuracy of the weight-quant model in some cases, \
but asymmetric quant model cannot run on old MNN versions. You will need to upgrade MNN to new version to solve this problem. \
default: False, which means using SYMMETRIC quant method}]")
print(" [--compressionParamsFile COMPRESSION_PARAMS_PATH]")

def main():
@@ -35,6 +40,7 @@ def main():
Boolean to change the mnn usage. If True, the output\
model save data in half_float type")
parser.add_argument("--weightQuantBits", type=int, default=0)
parser.add_argument("--weightQuantAsymmetric", type=bool, default=False)
parser.add_argument("--compressionParamsFile", type=str, default=None,
help="The path of model compression file that stores the int8 calibration \
table for quantization or auxiliary parameters for sparsity.")
@@ -80,7 +86,7 @@ def main():
args.compressionParamsFile = ""

Tools.mnnconvert(args.MNNModel, args. modelFile, framework_type,\
args.fp16, args.prototxt, args.weightQuantBits, args.compressionParamsFile)
args.fp16, args.prototxt, args.weightQuantBits, args.weightQuantAsymmetric, args.compressionParamsFile)
return 0
if __name__ == "__main__":
main()
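
As a usage sketch of the converter options added above (assuming the script shown here is installed as mnnconvert.py; the file paths and the "ONNX" framework choice are placeholders, not taken from the diff):

```python
# Sketch: weight-only quantization with the new asymmetric option.
# Flag names come from the diff; everything else is a placeholder.
import subprocess

subprocess.check_call([
    "python", "mnnconvert.py",
    "-f", "ONNX",                       # assumed framework selector
    "--modelFile", "model.onnx",        # placeholder source model
    "--MNNModel", "model.mnn",          # placeholder output model
    "--weightQuantBits", "8",           # weight-only quant to 8 bits
    "--weightQuantAsymmetric", "True",  # new flag; needs a recent MNN runtime
])
```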

@@ -2,6 +2,12 @@
# Created by ruhuan on 2019.08.31
""" build wheel tool """
from __future__ import print_function
import argparse
parser = argparse.ArgumentParser(description='build pymnn wheel')
parser.add_argument('--x86', dest='x86', action='store_true', default=False,
help='build wheel for 32bit arch, only usable on windows')
args = parser.parse_args()

import os
import shutil
import platform
@@ -19,4 +25,4 @@ if __name__ == '__main__':
if IS_WINDOWS:
os.putenv('DISTUTILS_USE_SDK', '1')
os.putenv('MSSdk', '1')
os.system('python setup.py bdist_wheel')
os.system('python setup.py bdist_wheel %s' % ('--x86' if args.x86 else ''))

@@ -18,7 +18,10 @@ IS_DARWIN = (platform.system() == 'Darwin')
IS_LINUX = (platform.system() == 'Linux')
BUILD_DIR = 'pymnn_build'
BUILD_TYPE = 'RELEASE'
BUILD_ARCH = 'x64' # x64 or x86
BUILD_ARCH = 'x64'
if '--x86' in sys.argv:
BUILD_ARCH = ''
sys.argv.remove('--x86')

def check_env_flag(name, default=''):
""" check whether a env is set to Yes """
@@ -43,7 +46,7 @@ if os.path.isdir('../../schema/private'):

print ('Building with python wheel with package name ', package_name)

version = '1.0.11'
version = '1.1.0'
depend_pip_packages = ['flatbuffers', 'numpy']
if package_name == 'MNN':
README = os.path.join(os.getcwd(), "README.md")
@@ -172,6 +175,8 @@ def configure_extension_build():
tools_include_dirs += [os.path.join(root_dir, "source", "core")]
tools_include_dirs += [os.path.join(root_dir, "schema", "current")]
tools_include_dirs += [os.path.join(root_dir, "source")]
if IS_WINDOWS:
tools_include_dirs += [os.path.join(os.environ['Protobuf_SRC_ROOT_FOLDER'], 'src')]

tools_depend = ['-lMNN', '-lMNNConvertDeps', '-lz']

@@ -30,9 +30,10 @@ static PyObject* PyTool_Converter(PyObject *self, PyObject *args) {
PyObject* frameworkType = NULL;
PyObject* fp16 = NULL;
PyObject* weightQuantBits = NULL;
if (!PyArg_ParseTuple(args, "ssOO|sOs", &mnnModel, &modelFile,
PyObject* weightQuantAsymmetric = NULL;
if (!PyArg_ParseTuple(args, "ssOO|sOOs", &mnnModel, &modelFile,
&frameworkType, &fp16, &prototxtFile,
&weightQuantBits, &compressionParamsFile)) {
&weightQuantBits, &weightQuantAsymmetric, &compressionParamsFile)) {
return NULL;
}
struct modelConfig modelPath;
@@ -44,6 +45,7 @@ static PyObject* PyTool_Converter(PyObject *self, PyObject *args) {
modelPath.saveHalfFloat = static_cast<bool>(PyLong_AsLong(fp16));
modelPath.forTraining = false;
modelPath.weightQuantBits = static_cast<int>(PyLong_AsLong(weightQuantBits));
modelPath.weightQuantAsymmetric = static_cast<bool>(PyLong_AsLong(weightQuantAsymmetric));
if(prototxtFile){
modelPath.prototxtFile = std::string(prototxtFile);
}

@@ -214,9 +214,10 @@ ErrorCode CPUArgMax::onExecute(const std::vector<Tensor *> &inputs, const std::v
backend()->onCopyBuffer(&mOutputBuffer, output);
} else {
float *dstOrigin = output->host<float>();
int outMaxValNum = mOutMaxVal + 1;
for (int i = 0; i < mNum; ++i) {
float *iptr = srcOrigin + i * mDim * mKeyExtent;
float *optr = dstOrigin + i * mKeyExtent;
float *optr = dstOrigin + i * mKeyExtent * mTopk * outMaxValNum;

for (int k = 0; k < mKeyExtent; ++k) {
// apply threshold
@@ -239,9 +240,9 @@ ErrorCode CPUArgMax::onExecute(const std::vector<Tensor *> &inputs, const std::v
// copy index
for (int j = 0; j < mTopk; ++j) {
if (j < sortDim) {
optr[k + j*mKeyExtent] = element_index(vec[j]);
optr[k * outMaxValNum * mTopk + j] = element_index(vec[j]);
} else {
optr[k + j*mKeyExtent] = 0.f;
optr[k * outMaxValNum * mTopk + j] = 0.f;
}
}

@@ -249,9 +250,9 @@ ErrorCode CPUArgMax::onExecute(const std::vector<Tensor *> &inputs, const std::v
if (mOutMaxVal) {
for (int j = 0; j < mTopk; ++j) {
if (j < sortDim) {
optr[k + j*mKeyExtent] = element_value(vec[j]);
optr[k * outMaxValNum * mTopk + mTopk + j] = element_value(vec[j]);
} else {
optr[k + j*mKeyExtent] = 0.f;
optr[k * outMaxValNum * mTopk + mTopk + j] = 0.f;
}
}
}
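
The new index arithmetic above lays the output out per key as mTopk indices followed, when mOutMaxVal is set, by the corresponding mTopk values, with a stride of mKeyExtent * mTopk * (mOutMaxVal + 1) floats per outer row. An illustrative NumPy sketch of that layout (the (mNum, mKeyExtent, mDim) source view and the plain descending sort are simplifying assumptions; the real kernel applies a threshold and a partial sort):

```python
import numpy as np

def argmax_topk_layout(src, topk, out_max_val):
    """Sketch of the output layout implied by the new indexing:
    per key, `topk` indices first, then `topk` values if out_max_val is set."""
    num, key_extent, dim = src.shape
    out_per_key = topk * (out_max_val + 1)            # mTopk * outMaxValNum
    out = np.zeros((num, key_extent, out_per_key), dtype=np.float32)
    for i in range(num):
        for k in range(key_extent):
            order = np.argsort(-src[i, k])[:topk]     # top-k positions by value
            out[i, k, :topk] = order                  # index block
            if out_max_val:
                out[i, k, topk:2 * topk] = src[i, k][order]  # value block
    return out.reshape(num, -1)                       # flat per-row layout
```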

@@ -264,7 +264,7 @@ ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vec
if (biasLength % 4 != 0) {
// Padding to align of 4
biasWrap.reset(Tensor::createDevice<float>({UP_DIV(biasLength, 4) * 4}));
bool res = backend()->onAcquireBuffer(biasWrap.get(), Backend::DYNAMIC);
res = backend()->onAcquireBuffer(biasWrap.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}

@@ -19,7 +19,6 @@ extern void ___CPUSizeCreator__OpType_Size__();
extern void ___CPUUnravelIndexCreator__OpType_UnravelIndex__();
extern void ___CPUMatMulCreator__OpType_MatMul__();
extern void ___CPUMomentsCreator__OpType_Moments__();
extern void ___CPUPoolGradCreator__OpType_PoolGrad__();
extern void ___CPUInstanceNormCreator__OpType_InstanceNorm__();
extern void ___CPUQuantizedLogisticCreator__OpType_QuantizedLogistic__();
extern void ___CPUWhereCreator__OpType_Where__();
@@ -93,7 +92,6 @@ ___CPUSizeCreator__OpType_Size__();
___CPUUnravelIndexCreator__OpType_UnravelIndex__();
___CPUMatMulCreator__OpType_MatMul__();
___CPUMomentsCreator__OpType_Moments__();
___CPUPoolGradCreator__OpType_PoolGrad__();
___CPUInstanceNormCreator__OpType_InstanceNorm__();
___CPUQuantizedLogisticCreator__OpType_QuantizedLogistic__();
___CPUWhereCreator__OpType_Where__();

@@ -63,6 +63,8 @@ struct QuanPostTreatParameters {
const int32_t* bias;
int32_t maxValue;
int32_t minValue;
float roundValuePos = 0.5f;
float roundValueNeg = -0.5f;
};
void MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post);
void MNNGemmInt8AddBiasScale_16x4_Unit_FAST(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post);
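
The two fields added to QuanPostTreatParameters default to +0.5 and -0.5, which is consistent with rounding the requantized accumulator half away from zero before it is clamped to [minValue, maxValue]. A small Python illustration of that rounding rule (an interpretation of the defaults, not code taken from the kernels):

```python
def round_half_away_from_zero(x, round_pos=0.5, round_neg=-0.5):
    """int() truncates toward zero, so adding +/-0.5 first rounds half away from zero."""
    return int(x + (round_pos if x >= 0 else round_neg))

print([round_half_away_from_zero(v) for v in (2.3, 2.5, -2.5, -2.3)])  # [2, 3, -3, -2]
```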
@ -90,6 +90,7 @@ void MNNFunctionInit() {
|
|||
gFunc.MNNPackC4ForMatMul_A = _AVX_MNNPackC4ForMatMul_A;
|
||||
gFunc.MNNConvRunForLineDepthwise = _AVX_MNNConvRunForLineDepthwise;
|
||||
gFunc.MNNGemmInt8AddBiasScale_16x4_Unit = _AVX_MNNGemmInt8AddBiasScale_16x4_Unit;
|
||||
gFunc.MNNExpC8 = _AVX_MNNExpC8;
|
||||
if (cpuFlags & libyuv::kCpuHasFMA3) {
|
||||
gFunc.MNNGemmFloatUnit_4 = _AVX_MNNGemmFloatUnitFMA_4;
|
||||
gFunc.MNNGemmFloatCommon_4 = _AVX_MNNGemmFloatCommonFMA_4;
|
||||
|
|
@ -323,28 +324,6 @@ void MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSiz
float* cache, const float* postParameters, const float* bias) {
return gFunc.MNNPackedMatMulRemain(C, A, B, eSize, parameter, cache, postParameters, bias);
}
/**
void MNNExpC8(float* dest, const float* source, const float* parameters, size_t countC8) {
auto count = countC8 * 8;
auto param = parameters[0];
float xLimit = 87;
for (int i = 0; i < count; ++i) {
auto x = -source[i];
x = ALIMAX(x, -xLimit);
x = ALIMIN(x, xLimit);
int div = (x * parameters[1]);
int div2 = (div + 127) << 23;
auto xReamin = x - div * param;
float expBasic = *(float*)(&div2);
auto t = xReamin;
auto expRemain =
((((parameters[7] * t + parameters[6]) * t + parameters[5]) * t + parameters[4]) * t + parameters[3]) * t +
parameters[2];
dest[i] = expBasic * expRemain;
}
}

*/
void MNNExpC8(float* dest, const float* source, const float* parameters, size_t countC8) {
gFunc.MNNExpC8(dest, source, parameters, countC8);
}
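
The commented-out scalar version above documents the exp approximation used here: split the argument into an integer multiple of ln2 plus a remainder, build 2^n directly from the float exponent bits, and evaluate exp(remainder) with a short polynomial (note the routine negates its input first, x = -source[i]). Below is a standalone sketch of the same idea, using plain Taylor coefficients rather than MNN's tuned parameters.

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Sketch only: exp(v) = 2^n * exp(r) with v = n*ln2 + r. 2^n is assembled
// from IEEE-754 exponent bits, exp(r) from a degree-5 Taylor polynomial.
static float expApprox(float v) {
    const float ln2 = 0.69314718f, invLn2 = 1.44269504f;
    const float limit = 87.0f;               // keep 2^n inside float range
    v = std::fmax(std::fmin(v, limit), -limit);
    int n = (int)(v * invLn2);               // truncated, as in the scalar code
    float r = v - n * ln2;                   // remainder near zero
    int32_t bits = (n + 127) << 23;          // float with exponent n, mantissa 1.0
    float pow2n;
    std::memcpy(&pow2n, &bits, sizeof(pow2n));
    // exp(r) ~ 1 + r + r^2/2 + r^3/6 + r^4/24 + r^5/120
    float poly = ((((r / 120.f + 1.f / 24.f) * r + 1.f / 6.f) * r + 0.5f) * r + 1.f) * r + 1.f;
    return pow2n * poly;
}

int main() {
    for (float v : {-2.0f, -0.5f, 0.0f, 1.0f, 3.0f}) {
        std::printf("v=%5.2f approx=%10.6f libm=%10.6f\n", v, expApprox(v), std::exp(v));
    }
    return 0;
}
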
@ -124,6 +124,48 @@ static void _postTreat(float* C, size_t eSize, const size_t* parameter, const fl
}
}

void _AVX_MNNExpC8(float* dest, const float* source, const float* parameters, size_t countC8) {
auto count = countC8;
auto p0 = _mm256_set1_ps(parameters[0]);
auto p1 = _mm256_set1_ps(parameters[1]);
auto p2 = _mm256_set1_ps(parameters[2]);
auto p3 = _mm256_set1_ps(parameters[3]);
auto p4 = _mm256_set1_ps(parameters[4]);
auto p5 = _mm256_set1_ps(parameters[5]);
auto p6 = _mm256_set1_ps(parameters[6]);
auto p7 = _mm256_set1_ps(parameters[7]);
auto xMax = _mm256_set1_ps(87);
auto xMin = _mm256_set1_ps(-87);
auto basic = _mm256_set1_epi32(1 << 23);
auto temp127 = _mm256_set1_epi32(127);
auto negZero = _mm256_set1_ps(-0.f);
for (int i = 0; i < count; ++i) {
auto x = _mm256_xor_ps(_mm256_loadu_ps(source + i * 8), negZero);
x = _mm256_max_ps(x, xMin);
x = _mm256_min_ps(x, xMax);
auto div = _mm256_mul_ps(x, p1);
auto divInt = _mm256_cvtps_epi32(div);
div = _mm256_cvtepi32_ps(divInt);
auto div2 = _mm256_add_epi32(divInt, temp127);
div2 = _mm256_mullo_epi32(div2, basic);
auto expBasic = _mm256_castsi256_ps(div2);
auto xReamin = _mm256_sub_ps(x, _mm256_mul_ps(div, p0));
auto t = xReamin;
auto c0 = _mm256_mul_ps(p7, t);
auto c1 = _mm256_add_ps(c0, p6);
auto c2 = _mm256_mul_ps(c1, t);
auto c3 = _mm256_add_ps(c2, p5);
auto c4 = _mm256_mul_ps(c3, t);
auto c5 = _mm256_add_ps(c4, p4);
auto c6 = _mm256_mul_ps(c5, t);
auto c7 = _mm256_add_ps(c6, p3);
auto c8 = _mm256_mul_ps(c7, t);
auto c9 = _mm256_add_ps(c8, p2);
auto expRemain = c9;
_mm256_storeu_ps(dest + 8 * i, _mm256_mul_ps(expBasic, expRemain));
}
}

void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep) {

@ -90,4 +90,6 @@ void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
size_t srcHStep, size_t dstHStep);
void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post);

void _AVX_MNNExpC8(float* dest, const float* source, const float* parameters, size_t countC8);

}

@ -203,8 +203,19 @@ void AVX2GemmPostTreat(float* C, size_t eSize, const size_t* parameter, const fl
}
}
}

#ifdef MNN_X86_USE_ASM
extern "C" {
void _AVX_MNNGemmInt8AddBiasScale_16x4_UnitMain(int8_t* dst, const int8_t* src, const int8_t* weight, const size_t* strides, const QuanPostTreatParameters* post);
}
#endif
void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post) {
#ifdef MNN_X86_USE_ASM
size_t strides[3];
strides[0] = src_depth_quad;
strides[1] = dst_step;
strides[2] = dst_depth_quad;
_AVX_MNNGemmInt8AddBiasScale_16x4_UnitMain(dst, src, weight, strides, post);
#else
const auto dst_step_tmp = dst_step / sizeof(int8_t);
__m128 zero128 = _mm_set1_ps(0.0f);
__m128 minValue = _mm_set1_ps(post->minValue);

@ -356,4 +367,5 @@ auto d##i = _mm_add_epi32(d##i##0, d##i##1);
d0 = _mm_packs_epi16(d0, d2);
_mm_storeu_ps((float*)dst_x, _mm_castsi128_ps(d0));
}
#endif
}

@ -0,0 +1,309 @@
|
|||
//
|
||||
// _AVX_MNNGemmInt8AddBiasScale_16x4_Unit.S
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2020/11/04.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#include "../MNNAsmGlobal.h"
|
||||
.text
|
||||
.align 4
|
||||
|
||||
//struct QuanPostTreatParameters {
|
||||
// const float* scale;
|
||||
// const int32_t* bias;
|
||||
// int32_t maxValue;
|
||||
// int32_t minValue;
|
||||
// float roundValuePos = 0.5f;
|
||||
// float roundValueNeg = -0.5f;
|
||||
//};
|
||||
|
||||
asm_function _AVX_MNNGemmInt8AddBiasScale_16x4_UnitMain
|
||||
//void _AVX_MNNGemmInt8AddBiasScale_16x4_UnitMain(int8_t* dst, const int8_t* src, const int8_t* weight, const size_t* strides, const QuanPostTreatParameters* post);
|
||||
|
||||
|
||||
// SystemV Auto: rdi: dst, rsi:src, rdx:weight, rcx:strides, r8: post
|
||||
// Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter
|
||||
pushq %rbp
|
||||
movq %rsp, %rbp
|
||||
|
||||
#ifdef WIN32
|
||||
movq 48(%rsp), %r10
|
||||
pushq %rdi
|
||||
pushq %rsi
|
||||
pushq %r12
|
||||
pushq %r13
|
||||
movq %rcx, %rdi
|
||||
movq %rdx, %rsi
|
||||
movq %r8, %rdx
|
||||
movq %r9, %rcx
|
||||
movq %r10, %r9
|
||||
pushq %r14
|
||||
pushq %r15
|
||||
#else
|
||||
pushq %r12
|
||||
pushq %r13
|
||||
pushq %r14
|
||||
pushq %r15
|
||||
movq %r8, %r9
|
||||
#endif
|
||||
|
||||
movq 8(%rcx), %r10 // dst_step
|
||||
movq 16(%rcx), %r8 // dst_depth_quad
|
||||
movq (%rcx), %rcx // src_depth_quad
|
||||
movq (%r9), %r12 // scale
|
||||
movq 8(%r9), %r15 // bias
|
||||
|
||||
|
||||
// ymm0-ymm1: Src
|
||||
// ymm2-ymm3: Weight
|
||||
// ymm4-ymm7: TmpDst
|
||||
// ymm8-ymm15: Dst Sum
|
||||
|
||||
// Last dst save to ymm8-ymm11
|
||||
|
||||
cmpq $0, %r8
|
||||
je End
|
||||
|
||||
movq %rsi, %r13
|
||||
subq $64, %rsp
|
||||
LoopDz:
|
||||
movq %rcx, %r11
|
||||
movq %r13, %rsi
|
||||
movq %rdx, %r14
|
||||
subq $1, %r11
|
||||
vpmovsxbw (%rsi), %ymm0
|
||||
vpmovsxbw 16(%rsi), %ymm1
|
||||
vpmovsxbw (%rdx), %ymm2
|
||||
vpmovsxbw 16(%rdx), %ymm3
|
||||
|
||||
vpmaddwd %ymm0, %ymm2, %ymm8
|
||||
vpmaddwd %ymm0, %ymm3, %ymm9
|
||||
vpmaddwd %ymm1, %ymm2, %ymm12
|
||||
vpmaddwd %ymm1, %ymm3, %ymm13
|
||||
vpmovsxbw 32(%rdx), %ymm2
|
||||
vpmovsxbw 48(%rdx), %ymm3
|
||||
|
||||
vpmaddwd %ymm0, %ymm2, %ymm10
|
||||
vpmaddwd %ymm0, %ymm3, %ymm11
|
||||
vpmaddwd %ymm1, %ymm2, %ymm14
|
||||
vpmaddwd %ymm1, %ymm3, %ymm15
|
||||
addq $64, %rdx
|
||||
addq $64, %rsi
|
||||
|
||||
testq %r11, %r11
|
||||
je FirstLoopSzEnd
|
||||
|
||||
FirstLoopSz:
|
||||
vpmovsxbw (%rsi), %ymm0
|
||||
vpmovsxbw 16(%rsi), %ymm1
|
||||
vpmovsxbw (%rdx), %ymm2
|
||||
vpmovsxbw 16(%rdx), %ymm3
|
||||
|
||||
vpmaddwd %ymm0, %ymm2, %ymm4
|
||||
vpmaddwd %ymm0, %ymm3, %ymm5
|
||||
vpmaddwd %ymm1, %ymm2, %ymm6
|
||||
vpmaddwd %ymm1, %ymm3, %ymm7
|
||||
vpaddd %ymm4, %ymm8, %ymm8
|
||||
vpaddd %ymm5, %ymm9, %ymm9
|
||||
vpmovsxbw 32(%rdx), %ymm2
|
||||
vpmovsxbw 48(%rdx), %ymm3
|
||||
vpaddd %ymm6, %ymm12, %ymm12
|
||||
vpaddd %ymm7, %ymm13, %ymm13
|
||||
|
||||
|
||||
vpmaddwd %ymm0, %ymm2, %ymm4
|
||||
vpmaddwd %ymm0, %ymm3, %ymm5
|
||||
vpmaddwd %ymm1, %ymm2, %ymm6
|
||||
vpmaddwd %ymm1, %ymm3, %ymm7
|
||||
vpaddd %ymm4, %ymm10, %ymm10
|
||||
vpaddd %ymm5, %ymm11, %ymm11
|
||||
vpaddd %ymm6, %ymm14, %ymm14
|
||||
vpaddd %ymm7, %ymm15, %ymm15
|
||||
|
||||
addq $64, %rdx
|
||||
addq $64, %rsi
|
||||
|
||||
subq $1, %r11
|
||||
testq %r11, %r11
|
||||
jne FirstLoopSz
|
||||
|
||||
FirstLoopSzEnd:
|
||||
|
||||
vphaddd %ymm9, %ymm8, %ymm8
|
||||
vphaddd %ymm11, %ymm10, %ymm10
|
||||
vphaddd %ymm13, %ymm12, %ymm12
|
||||
vphaddd %ymm15, %ymm14, %ymm14
|
||||
|
||||
vphaddd %ymm10, %ymm8, %ymm8
|
||||
vphaddd %ymm14, %ymm12, %ymm9
|
||||
|
||||
vmovups %ymm8, (%rsp)
|
||||
vmovups %ymm9, 32(%rsp)
|
||||
|
||||
movq %rcx, %r11
|
||||
movq %r13, %rsi
|
||||
movq %r14, %rdx
|
||||
vpmovsxbw 32(%rsi), %ymm0
|
||||
vpmovsxbw 48(%rsi), %ymm1
|
||||
vpmovsxbw (%rdx), %ymm2
|
||||
vpmovsxbw 16(%rdx), %ymm3
|
||||
|
||||
vpmaddwd %ymm0, %ymm2, %ymm8
|
||||
vpmaddwd %ymm0, %ymm3, %ymm9
|
||||
vpmaddwd %ymm1, %ymm2, %ymm12
|
||||
vpmaddwd %ymm1, %ymm3, %ymm13
|
||||
|
||||
vpmovsxbw 32(%rdx), %ymm2
|
||||
vpmovsxbw 48(%rdx), %ymm3
|
||||
|
||||
vpmaddwd %ymm0, %ymm2, %ymm10
|
||||
vpmaddwd %ymm0, %ymm3, %ymm11
|
||||
vpmaddwd %ymm1, %ymm2, %ymm14
|
||||
vpmaddwd %ymm1, %ymm3, %ymm15
|
||||
|
||||
addq $64, %rdx
|
||||
addq $64, %rsi
|
||||
|
||||
subq $1, %r11
|
||||
testq %r11, %r11
|
||||
je SecondLoopSzEnd
|
||||
|
||||
SecondLoopSz:
|
||||
vpmovsxbw 32(%rsi), %ymm0
|
||||
vpmovsxbw 48(%rsi), %ymm1
|
||||
vpmovsxbw (%rdx), %ymm2
|
||||
vpmovsxbw 16(%rdx), %ymm3
|
||||
|
||||
vpmaddwd %ymm0, %ymm2, %ymm4
|
||||
vpmaddwd %ymm0, %ymm3, %ymm5
|
||||
vpmaddwd %ymm1, %ymm2, %ymm6
|
||||
vpmaddwd %ymm1, %ymm3, %ymm7
|
||||
vpaddd %ymm4, %ymm8, %ymm8
|
||||
vpaddd %ymm5, %ymm9, %ymm9
|
||||
vpmovsxbw 32(%rdx), %ymm2
|
||||
vpmovsxbw 48(%rdx), %ymm3
|
||||
vpaddd %ymm6, %ymm12, %ymm12
|
||||
vpaddd %ymm7, %ymm13, %ymm13
|
||||
|
||||
|
||||
vpmaddwd %ymm0, %ymm2, %ymm4
|
||||
vpmaddwd %ymm0, %ymm3, %ymm5
|
||||
vpmaddwd %ymm1, %ymm2, %ymm6
|
||||
vpmaddwd %ymm1, %ymm3, %ymm7
|
||||
vpaddd %ymm4, %ymm10, %ymm10
|
||||
vpaddd %ymm5, %ymm11, %ymm11
|
||||
vpaddd %ymm6, %ymm14, %ymm14
|
||||
vpaddd %ymm7, %ymm15, %ymm15
|
||||
|
||||
addq $64, %rdx
|
||||
addq $64, %rsi
|
||||
|
||||
subq $1, %r11
|
||||
testq %r11, %r11
|
||||
jne SecondLoopSz
|
||||
SecondLoopSzEnd:
|
||||
|
||||
vphaddd %ymm9, %ymm8, %ymm8
|
||||
vphaddd %ymm11, %ymm10, %ymm10
|
||||
vphaddd %ymm13, %ymm12, %ymm12
|
||||
vphaddd %ymm15, %ymm14, %ymm14
|
||||
|
||||
vphaddd %ymm10, %ymm8, %ymm10
|
||||
vphaddd %ymm14, %ymm12, %ymm11
|
||||
|
||||
vmovups (%rsp), %ymm8
|
||||
vmovups 32(%rsp), %ymm9
|
||||
|
||||
Last:
|
||||
.macro TRANSPOSE x0, x1, x2, x3
|
||||
// 32 = 0 + 16 * 2: first 128 x0_lo, second 128 x1_lo
// 49 = 1 + 16 * 3: first 128 x0_hi, second 128 x1_hi
|
||||
vperm2f128 $32, \x1, \x0, \x2
|
||||
vperm2f128 $49, \x1, \x0, \x3
|
||||
.endm
|
||||
TRANSPOSE %ymm8, %ymm10, %ymm0, %ymm1
|
||||
TRANSPOSE %ymm9, %ymm11, %ymm2, %ymm3
|
||||
|
||||
vpaddd %ymm0, %ymm1, %ymm0
|
||||
vpaddd %ymm2, %ymm3, %ymm2
|
||||
|
||||
vbroadcastf128 (%r12), %ymm8
|
||||
vbroadcastf128 (%r15), %ymm9
|
||||
|
||||
vpaddd %ymm9, %ymm0, %ymm0
|
||||
vpaddd %ymm9, %ymm2, %ymm2
|
||||
|
||||
vcvtdq2ps %ymm0, %ymm0
|
||||
vcvtdq2ps %ymm2, %ymm2
|
||||
|
||||
vmulps %ymm8, %ymm0, %ymm0
|
||||
vmulps %ymm8, %ymm2, %ymm2
|
||||
// zero
|
||||
vxorps %ymm13, %ymm13, %ymm13
|
||||
|
||||
vbroadcastss 24(%r9), %ymm14
|
||||
vbroadcastss 28(%r9), %ymm15
|
||||
vbroadcastss 16(%r9), %ymm10
|
||||
vbroadcastss 20(%r9), %ymm11
|
||||
|
||||
// Round
|
||||
vcmpltps %ymm13, %ymm0, %ymm4
|
||||
vcmpltps %ymm13, %ymm2, %ymm5
|
||||
|
||||
vblendvps %ymm4, %ymm15, %ymm14, %ymm4
|
||||
vblendvps %ymm5, %ymm15, %ymm14, %ymm5
|
||||
|
||||
vaddps %ymm0, %ymm4, %ymm0
|
||||
vaddps %ymm2, %ymm5, %ymm2
|
||||
|
||||
// 3: ROUND to Zero
|
||||
vroundps $3, %ymm0, %ymm0
|
||||
vroundps $3, %ymm2, %ymm2
|
||||
vcvtps2dq %ymm0, %ymm0
|
||||
vcvtps2dq %ymm2, %ymm2
|
||||
|
||||
vpminsd %ymm10, %ymm0, %ymm0
|
||||
vpminsd %ymm10, %ymm2, %ymm2
|
||||
|
||||
vpmaxsd %ymm11, %ymm0, %ymm0
|
||||
vpmaxsd %ymm11, %ymm2, %ymm2
|
||||
|
||||
vpackssdw %ymm2, %ymm0, %ymm0
|
||||
vperm2f128 $1, %ymm0, %ymm0, %ymm1
|
||||
vpacksswb %ymm1, %ymm0, %ymm0
|
||||
|
||||
addq $16, %r12
|
||||
addq $16, %r15
|
||||
|
||||
vmovups %xmm0, (%rdi)
|
||||
addq %r10, %rdi
|
||||
|
||||
subq $1, %r8
|
||||
testq %r8, %r8
|
||||
jne LoopDz
|
||||
addq $64, %rsp
|
||||
|
||||
End:
|
||||
|
||||
#ifdef WIN32
|
||||
popq %r15
|
||||
popq %r14
|
||||
popq %r13
|
||||
popq %r12
|
||||
popq %rsi
|
||||
popq %rdi
|
||||
popq %rbp
|
||||
#else
|
||||
popq %r15
|
||||
popq %r14
|
||||
popq %r13
|
||||
popq %r12
|
||||
popq %rbp
|
||||
#endif
|
||||
|
||||
// FIXME: without vzeroall here, subsequent ops run slowly
|
||||
vzeroall
|
||||
retq
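
For readers more used to intrinsics, the TRANSPOSE macro above maps to _mm256_permute2f128: immediate 0x20 (32) keeps the low 128-bit halves of both operands, 0x31 (49) keeps the high halves. A sketch follows; it is shown on float vectors, but the lane shuffle preserves bit patterns, so it applies equally to the int32 sums used in this kernel.

#include <immintrin.h>

// lo = [x0_lo, x1_lo], hi = [x0_hi, x1_hi], mirroring the macro above.
static inline void transpose128(__m256 x0, __m256 x1, __m256* lo, __m256* hi) {
    *lo = _mm256_permute2f128_ps(x0, x1, 0x20);
    *hi = _mm256_permute2f128_ps(x0, x1, 0x31);
}
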
@ -14,18 +14,7 @@
|
|||
|
||||
#if MNN_METAL_ENABLED
|
||||
|
||||
#if MNN_METAL_DEBUG && MNN_METAL_BENCHMARK
|
||||
#define MNN_PRINT_ENCODER(context, encoder) \
|
||||
{ \
|
||||
[context printEncoder:encoder]; \
|
||||
[context commit]; \
|
||||
[context wait]; \
|
||||
}
|
||||
#elif MNN_METAL_DEBUG
|
||||
#define MNN_PRINT_ENCODER(context, encoder) [context printEncoder:encoder];
|
||||
#else
|
||||
#define MNN_PRINT_ENCODER(context, encoder) ((void)0)
|
||||
#endif
|
||||
|
||||
namespace MNN {
|
||||
typedef enum {
|
||||
|
|
@ -121,6 +110,10 @@ typedef struct {
|
|||
threads:(MTLSize)threads
|
||||
threadsPerGroup:(MTLSize)threadsPerGroup
|
||||
bandwidth:(MNN::MetalBandwidth)bandwidth;
|
||||
- (id<MTLComputePipelineState>)pipelineWithName:(NSString *)name;
|
||||
- (MTLSize)computeBestGroup:(id<MTLComputePipelineState>) pipeline threads:(MTLSize)threads;
|
||||
|
||||
- (std::pair<MTLSize, MTLSize>)computeBestGroupAndLocal:(id<MTLComputePipelineState>) bw threads:(MTLSize)t;
|
||||
|
||||
#if MNN_METAL_DEBUG
|
||||
/**
|
||||
|
|
@ -147,6 +140,8 @@ typedef struct {
|
|||
* @brief print encoder
|
||||
*/
|
||||
- (void)printEncoder:(id<MTLCommandEncoder>)encoder;
|
||||
|
||||
|
||||
#endif
|
||||
@end
|
||||
|
||||
|
|
|
|||
|
|
@ -176,10 +176,7 @@ using namespace MNN;
|
|||
}
|
||||
|
||||
- (void)wait {
|
||||
NSArray *buffers = _waitings.copy;
|
||||
[_waitings removeAllObjects];
|
||||
|
||||
for (id<MTLCommandBuffer> buffer in buffers) {
|
||||
for (id<MTLCommandBuffer> buffer in _waitings) {
|
||||
if (buffer.status >= MTLCommandBufferStatusCompleted)
|
||||
continue;
|
||||
|
||||
|
|
@ -204,6 +201,7 @@ using namespace MNN;
|
|||
}
|
||||
#endif
|
||||
}
|
||||
[_waitings removeAllObjects];
|
||||
}
|
||||
|
||||
static NSUInteger smallest_log2(NSUInteger integer) {
|
||||
|
|
@ -217,6 +215,83 @@ static NSUInteger smallest_log2(NSUInteger integer) {
|
|||
return power;
|
||||
}
|
||||
|
||||
- (std::pair<MTLSize, MTLSize>)computeBestGroupAndLocal:(id<MTLComputePipelineState>) bw threads:(MTLSize)t {
|
||||
auto local = [self computeBestGroup:bw threads:t];
|
||||
auto globalSize = MTLSizeMake(UP_DIV(t.width, local.width), UP_DIV(t.height, local.height), UP_DIV(t.depth, local.depth));
|
||||
return std::make_pair(globalSize, local);
|
||||
}
|
||||
|
||||
- (MTLSize)computeBestGroup:(id<MTLComputePipelineState>) bw threads:(MTLSize)t {
|
||||
if (bw.maxTotalThreadsPerThreadgroup > 64) {
|
||||
auto res = MTLSizeMake(8, 8, 8);
|
||||
int reduceNumber = 0;
|
||||
if (t.depth < 4) {
|
||||
res.depth = 1;
|
||||
reduceNumber++;
|
||||
}
|
||||
if (t.width < 4) {
|
||||
res.width = 1;
|
||||
reduceNumber++;
|
||||
}
|
||||
if (t.height < 4) {
|
||||
res.height = 1;
|
||||
reduceNumber++;
|
||||
}
|
||||
if (reduceNumber == 0) {
|
||||
return MTLSizeMake(4, 4, 4);
|
||||
}
|
||||
if (reduceNumber == 2) {
|
||||
if (res.width > 1) {
|
||||
res.width = 64;
|
||||
}
|
||||
if (res.height > 1) {
|
||||
res.height = 64;
|
||||
}
|
||||
if (res.depth > 1) {
|
||||
res.depth = 64;
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
auto pwarp = smallest_log2(bw.threadExecutionWidth);
|
||||
auto px = smallest_log2(t.width), sx = (NSUInteger)ceil(log2(t.width));
|
||||
auto py = smallest_log2(t.height), sy = (NSUInteger)ceil(log2(t.height));
|
||||
|
||||
// accurately match on x
|
||||
if (px >= pwarp) {
|
||||
return {bw.threadExecutionWidth, 1, 1};
|
||||
}
|
||||
// accurately match on xy
|
||||
else if (px + py >= pwarp && sx < pwarp / 2) {
|
||||
NSUInteger x = pow(2, px);
|
||||
return {x, bw.threadExecutionWidth / x, 1};
|
||||
}
|
||||
// similarly match on x
|
||||
else if (sx >= pwarp) {
|
||||
return {bw.threadExecutionWidth, 1, 1};
|
||||
}
|
||||
// similarly match on xy
|
||||
else if (sx + sy >= pwarp) {
|
||||
NSUInteger x = pow(2, sx);
|
||||
return {x, bw.threadExecutionWidth / x, 1};
|
||||
}
|
||||
|
||||
// on xyz (most shaders do not guard gid.z, so the z axis must be matched exactly)
|
||||
auto pz = smallest_log2(t.depth);
|
||||
auto sz = pz;
|
||||
if (px + py + pz >= pwarp) {
|
||||
NSUInteger x = pow(2, px), y = pow(2, py);
|
||||
return {x, y, bw.threadExecutionWidth / x / y};
|
||||
} else if (sx + sy + sz >= pwarp) {
|
||||
NSUInteger x = pow(2, sx), z = pow(2, MIN(sz, pwarp - sx));
|
||||
return {x, bw.threadExecutionWidth / x / z, z};
|
||||
} else {
|
||||
NSUInteger z = pow(2, sz);
|
||||
return {t.width, t.height, z};
|
||||
}
|
||||
|
||||
}
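
A plain-C++ illustration of the group/global split used by computeBestGroupAndLocal above, assuming UP_DIV(a, b) = (a + b - 1) / b: once a threadgroup ("local") size is chosen, the number of threadgroups dispatched per axis is the rounded-up quotient, so local times global always covers the requested thread count.

#include <cstddef>

struct Size3 { size_t width, height, depth; };

inline size_t upDiv(size_t a, size_t b) { return (a + b - 1) / b; }

// Number of threadgroups per axis for a given total thread count and local size.
Size3 groupsFor(Size3 threads, Size3 local) {
    return { upDiv(threads.width,  local.width),
             upDiv(threads.height, local.height),
             upDiv(threads.depth,  local.depth) };
}
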
- (MTLSize)threadsPerGroupWithThreads:(MTLSize)t bandwidth:(MetalBandwidth)bw {
|
||||
auto pwarp = smallest_log2(bw.threadExecutionWidth);
|
||||
auto px = smallest_log2(t.width), sx = (NSUInteger)ceil(log2(t.width));
|
||||
|
|
|
|||
|
|
@ -27,11 +27,14 @@ public:
|
|||
id<MTLBuffer> alloc(size_t size, bool seperate = false);
|
||||
void release(id<MTLBuffer> buffer);
|
||||
void clear();
|
||||
float computeSizeInMB() const;
|
||||
private:
|
||||
std::map<id<MTLBuffer>, size_t> mAllocated;
|
||||
std::multimap<size_t, id<MTLBuffer>> mReusableBuffers;
|
||||
void* mContext = nullptr;
|
||||
};
|
||||
virtual float onGetMemoryInMB() override;
|
||||
|
||||
MetalRuntime();
|
||||
virtual ~ MetalRuntime();
|
||||
virtual Backend* onCreate() const override;
|
||||
|
|
@ -115,18 +118,21 @@ public:
|
|||
* @param dstTensor destined tensor
|
||||
* @param encoder command encoder
|
||||
*/
|
||||
virtual void onCopyBuffer(const Tensor *srcTensor, const Tensor *dstTensor,
|
||||
id<MTLComputeCommandEncoder> encoder) const;
|
||||
void onCopyBuffer(const Tensor *srcTensor, const Tensor *dstTensor,
|
||||
id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> shape) const;
|
||||
|
||||
void flushEncoder() const;
|
||||
id<MTLComputeCommandEncoder> encoder() const;
|
||||
private:
|
||||
const MetalRuntime* mRuntime;
|
||||
std::vector<id<MTLBuffer>> mHoldBuffers;
|
||||
mutable id<MTLComputeCommandEncoder> mComputeEncoder = nil;
|
||||
|
||||
private:
|
||||
id<MTLBuffer> getHostBuffer(size_t size) const;
|
||||
void onCopyHostToDevice(const Tensor *src, const Tensor *dst) const;
|
||||
void onCopyDeviceToHost(const Tensor *src, const Tensor *dst) const;
|
||||
void onCopyDeviceToDevice(const Tensor *src, const Tensor *dst, id<MTLComputeCommandEncoder> encoder) const;
|
||||
void onCopyDeviceToDevice(const Tensor *src, const Tensor *dst, id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> shape) const;
|
||||
};
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -223,17 +223,17 @@ struct SamplerInfo {
|
|||
uint4 extent;//dstStride[3]+dstOffset
|
||||
uint4 imageSize;
|
||||
};
|
||||
kernel void blit_float(const device ftype *in [[buffer(0)]],
|
||||
device ftype *out [[buffer(1)]],
|
||||
|
||||
kernel void blit_intx4(const device int4 *in [[buffer(0)]],
|
||||
device int4 *out [[buffer(1)]],
|
||||
constant SamplerInfo &info [[buffer(2)]],
|
||||
uint3 gid [[thread_position_in_grid]]) {
|
||||
if (gid.x < info.size.x && gid.y < info.size.y && gid.z < info.size.z) {
|
||||
uint dstOffset = gid.x * info.extent.x + gid.y * info.extent.y + gid.z * info.extent.z + info.extent.w;
|
||||
uint srcOffset = gid.x * info.stride.x + gid.y * info.stride.y + gid.z * info.stride.z + info.stride.w;
|
||||
out[int(dstOffset)] = in[int(srcOffset)];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
kernel void blit_int(const device int *in [[buffer(0)]],
|
||||
device int *out [[buffer(1)]],
|
||||
constant SamplerInfo &info [[buffer(2)]],
|
||||
|
|
@ -265,3 +265,14 @@ kernel void blit_int16(const device short *in [[buffer(0)]],
|
|||
out[int(dstOffset)] = in[int(srcOffset)];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void blit_int64(const device short4 *in [[buffer(0)]],
|
||||
device short4 *out [[buffer(1)]],
|
||||
constant SamplerInfo &info [[buffer(2)]],
|
||||
uint3 gid [[thread_position_in_grid]]) {
|
||||
if (gid.x < info.size.x && gid.y < info.size.y && gid.z < info.size.z) {
|
||||
uint dstOffset = gid.x * info.extent.x + gid.y * info.extent.y + gid.z * info.extent.z + info.extent.w;
|
||||
uint srcOffset = gid.x * info.stride.x + gid.y * info.stride.y + gid.z * info.stride.z + info.stride.w;
|
||||
out[int(dstOffset)] = in[int(srcOffset)];
|
||||
}
|
||||
}
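
A CPU-side sketch of the addressing these blit kernels share: the source and destination offsets are a dot product of the 3-D position with per-axis strides, plus a base offset carried in the fourth component (stride.w / extent.w).

#include <cstdint>

// Illustrative only: SamplerInfo-style strided copy over a 3-D region.
struct Sampler { uint32_t stride[4]; uint32_t extent[4]; uint32_t size[4]; };

template <typename T>
void blit(const T* in, T* out, const Sampler& s) {
    for (uint32_t z = 0; z < s.size[2]; ++z)
      for (uint32_t y = 0; y < s.size[1]; ++y)
        for (uint32_t x = 0; x < s.size[0]; ++x) {
            uint32_t dst = x * s.extent[0] + y * s.extent[1] + z * s.extent[2] + s.extent[3];
            uint32_t src = x * s.stride[0] + y * s.stride[1] + z * s.stride[2] + s.stride[3];
            out[dst] = in[src];
        }
}
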
@ -23,6 +23,20 @@ MetalRuntime::BufferAllocator::BufferAllocator(void* context) {
|
|||
MetalRuntime::BufferAllocator::~ BufferAllocator() {
|
||||
// Do nothing
|
||||
}
|
||||
float MetalRuntime::BufferAllocator::computeSizeInMB() const {
|
||||
float total = 0.0f;
|
||||
for (auto& iter : mAllocated) {
|
||||
total += iter.second / 1024.0f / 1024.0f;
|
||||
}
|
||||
for (auto& iter : mReusableBuffers) {
|
||||
total += iter.first / 1024.0f / 1024.0f;
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
float MetalRuntime::onGetMemoryInMB() {
|
||||
return mStatic->computeSizeInMB() + mDynamic->computeSizeInMB();
|
||||
}
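
A small sketch of the accounting above: live buffers are tracked with their sizes in one map and idle, reusable buffers in a size-keyed multimap, so the footprint in MB is just the sum over both containers. The container shapes mirror mAllocated and mReusableBuffers; the buffer type is left generic here.

#include <cstddef>
#include <map>

// Illustrative counterpart of computeSizeInMB over the two containers.
template <typename Buf>
float totalMB(const std::map<Buf, size_t>& allocated,
              const std::multimap<size_t, Buf>& reusable) {
    float total = 0.0f;
    for (const auto& it : allocated) total += it.second / 1024.0f / 1024.0f;
    for (const auto& it : reusable)  total += it.first  / 1024.0f / 1024.0f;
    return total;
}
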
id<MTLBuffer> MetalRuntime::BufferAllocator::alloc(size_t size, bool seperate) {
|
||||
if (!seperate) {
|
||||
|
|
@ -164,10 +178,18 @@ Execution *MetalBackend::onCreate(const std::vector<Tensor *> &inputs, const std
|
|||
}
|
||||
return exe;
|
||||
}
|
||||
void MetalBackend::flushEncoder() const {
|
||||
if (nil != mComputeEncoder) {
|
||||
[mComputeEncoder endEncoding];
|
||||
mComputeEncoder = nil;
|
||||
}
|
||||
}
|
||||
|
||||
void MetalBackend::onExecuteBegin() const {
|
||||
// do nothing
|
||||
flushEncoder();
|
||||
}
|
||||
void MetalBackend::onExecuteEnd() const {
|
||||
flushEncoder();
|
||||
auto ctx = (__bridge MNNMetalContext *)context();
|
||||
[ctx commit];
|
||||
}
|
||||
|
|
@ -176,7 +198,7 @@ id<MTLBuffer> MetalBackend::getHostBuffer(size_t size) const {
|
|||
return mRuntime->getHostBuffer(size);
|
||||
}
|
||||
|
||||
std::tuple<id<MTLBuffer>, MTLSize> getTensorShape(MNNMetalContext *context, const Tensor *tensor) {
|
||||
MTLSize getTensorShape(id<MTLBuffer> shape, const Tensor *tensor) {
|
||||
int s = 1, c = 1, b = 1;
|
||||
if (tensor->dimensions() == 4) {
|
||||
s = tensor->width() * tensor->height();
|
||||
|
|
@ -193,7 +215,6 @@ std::tuple<id<MTLBuffer>, MTLSize> getTensorShape(MNNMetalContext *context, cons
|
|||
int z = UP_DIV(c, 4);
|
||||
|
||||
// shape
|
||||
auto shape = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
|
||||
((int *)shape.contents)[0] = s;
|
||||
((int *)shape.contents)[1] = c;
|
||||
((int *)shape.contents)[2] = z;
|
||||
|
|
@ -201,7 +222,7 @@ std::tuple<id<MTLBuffer>, MTLSize> getTensorShape(MNNMetalContext *context, cons
|
|||
|
||||
// threads
|
||||
MTLSize threads = {(NSUInteger)s, (NSUInteger)b * z, 1};
|
||||
return std::make_tuple(shape, threads);
|
||||
return threads;
|
||||
}
|
||||
|
||||
enum MetalCastType : int {
|
||||
|
|
@ -294,7 +315,8 @@ void MetalBackend::onCopyHostToDevice(const Tensor *src, const Tensor *dst) cons
|
|||
1,
|
||||
1
|
||||
};
|
||||
auto limitBuffer = [ctx newDeviceBuffer:4 * sizeof(int) bytes:&limits access:CPUWriteOnly];
|
||||
auto limitBuffer = mRuntime->mStatic->alloc(4 * sizeof(int));
|
||||
::memcpy(limitBuffer.contents, limits, sizeof(limits));
|
||||
auto encoder = [ctx encoder];
|
||||
auto bandwidth = [ctx load: @"downcast_float4" encoder:encoder];
|
||||
[encoder setBuffer:host offset:0 atIndex:0];
|
||||
|
|
@ -304,6 +326,7 @@ void MetalBackend::onCopyHostToDevice(const Tensor *src, const Tensor *dst) cons
|
|||
[encoder endEncoding];
|
||||
[ctx commit];
|
||||
[ctx wait];
|
||||
mRuntime->mStatic->release(limitBuffer);
|
||||
} else {
|
||||
[ctx commit];
|
||||
[ctx wait];
|
||||
|
|
@ -312,7 +335,8 @@ void MetalBackend::onCopyHostToDevice(const Tensor *src, const Tensor *dst) cons
|
|||
}
|
||||
// convert
|
||||
else {
|
||||
auto shape = getTensorShape(ctx, src);
|
||||
auto shape = mRuntime->mStatic->alloc(4 * sizeof(int));
|
||||
auto size = getTensorShape(shape, src);
|
||||
auto buffer = getHostBuffer(src->elementSize() * sizeof(float));
|
||||
memcpy(buffer.contents, src->host<float>(), src->size());
|
||||
auto encoder = [ctx encoder];
|
||||
|
|
@ -322,11 +346,12 @@ void MetalBackend::onCopyHostToDevice(const Tensor *src, const Tensor *dst) cons
|
|||
auto bandwidth = [ctx load:kernel encoder:encoder];
|
||||
[encoder setBuffer:buffer offset:0 atIndex:0];
|
||||
[encoder setBuffer:device offset:0 atIndex:1];
|
||||
[encoder setBuffer:std::get<0>(shape) offset:0 atIndex:2];
|
||||
[ctx dispatchEncoder:encoder threads:std::get<1>(shape) bandwidth:bandwidth];
|
||||
[encoder setBuffer:shape offset:0 atIndex:2];
|
||||
[ctx dispatchEncoder:encoder threads:size bandwidth:bandwidth];
|
||||
[encoder endEncoding];
|
||||
[ctx commit];
|
||||
[ctx wait];
|
||||
mRuntime->mStatic->release(shape);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -356,7 +381,8 @@ void MetalBackend::onCopyDeviceToHost(const Tensor *src, const Tensor *dst) cons
|
|||
1,
|
||||
1
|
||||
};
|
||||
auto limitBuffer = [ctx newDeviceBuffer:4 * sizeof(int) bytes:&limits access:CPUWriteOnly];
|
||||
auto limitBuffer = mRuntime->mStatic->alloc(4 * sizeof(int));
|
||||
::memcpy(limitBuffer.contents, limits, sizeof(limits));
|
||||
[encoder setBuffer:limitBuffer offset:0 atIndex:2];
|
||||
[ctx dispatchEncoder:encoder threads:{sizeC4, 1, 1} bandwidth:bandwidth];
|
||||
[encoder endEncoding];
|
||||
|
|
@ -364,6 +390,7 @@ void MetalBackend::onCopyDeviceToHost(const Tensor *src, const Tensor *dst) cons
|
|||
[ctx wait];
|
||||
|
||||
memcpy(dst->host<float>(), buffer.contents, dst->size());
|
||||
mRuntime->mStatic->release(limitBuffer);
|
||||
} else {
|
||||
[ctx commit];
|
||||
[ctx wait];
|
||||
|
|
@ -372,7 +399,8 @@ void MetalBackend::onCopyDeviceToHost(const Tensor *src, const Tensor *dst) cons
|
|||
}
|
||||
// convert
|
||||
else {
|
||||
auto shape = getTensorShape(ctx, src);
|
||||
auto shape = mRuntime->mStatic->alloc(4 * sizeof(int));
|
||||
auto size = getTensorShape(shape, src);
|
||||
auto buffer = getHostBuffer(dst->size());
|
||||
auto encoder = [ctx encoder];
|
||||
auto kernel = kernelForConvert(src->getType(), sfmt, dfmt, Up);
|
||||
|
|
@ -381,12 +409,13 @@ void MetalBackend::onCopyDeviceToHost(const Tensor *src, const Tensor *dst) cons
|
|||
auto bandwidth = [ctx load:kernel encoder:encoder];
|
||||
[encoder setBuffer:device offset:0 atIndex:0];
|
||||
[encoder setBuffer:buffer offset:0 atIndex:1];
|
||||
[encoder setBuffer:std::get<0>(shape) offset:0 atIndex:2];
|
||||
[ctx dispatchEncoder:encoder threads:std::get<1>(shape) bandwidth:bandwidth];
|
||||
[encoder setBuffer:shape offset:0 atIndex:2];
|
||||
[ctx dispatchEncoder:encoder threads:size bandwidth:bandwidth];
|
||||
[encoder endEncoding];
|
||||
[ctx commit];
|
||||
[ctx wait];
|
||||
memcpy(dst->host<float>(), buffer.contents, dst->size());
|
||||
mRuntime->mStatic->release(shape);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -406,7 +435,7 @@ void MetalBackend::AutoBuffer::reset(size_t length) {
|
|||
|
||||
|
||||
void MetalBackend::onCopyDeviceToDevice(const Tensor *src, const Tensor *dst,
|
||||
id<MTLComputeCommandEncoder> encoder) const {
|
||||
id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> shape) const {
|
||||
auto ctx = (__bridge MNNMetalContext *)context();
|
||||
auto standalone = encoder == nil;
|
||||
encoder = encoder ?: [ctx encoder];
|
||||
|
|
@ -426,13 +455,21 @@ void MetalBackend::onCopyDeviceToDevice(const Tensor *src, const Tensor *dst,
|
|||
else {
|
||||
auto kernel = kernelForConvert(src->getType(), sfmt, dfmt, None);
|
||||
MNN_ASSERT(kernel != nil); // unsupported sfmt to dfmt
|
||||
bool needRelease = false;
|
||||
if (shape == nil) {
|
||||
shape = mRuntime->mStatic->alloc(4 * sizeof(int));
|
||||
needRelease = true;
|
||||
}
|
||||
|
||||
auto shape = getTensorShape(ctx, src);
|
||||
auto size = getTensorShape(shape, src);
|
||||
auto bandwidth = [ctx load:kernel encoder:encoder];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)(src->buffer().device) offset:0 atIndex:0];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)(dst->buffer().device) offset:0 atIndex:1];
|
||||
[encoder setBuffer:std::get<0>(shape) offset:0 atIndex:2];
|
||||
[ctx dispatchEncoder:encoder threads:std::get<1>(shape) bandwidth:bandwidth];
|
||||
[encoder setBuffer:shape offset:0 atIndex:2];
|
||||
[ctx dispatchEncoder:encoder threads:size bandwidth:bandwidth];
|
||||
if (needRelease) {
|
||||
mRuntime->mStatic->release(shape);
|
||||
}
|
||||
}
|
||||
|
||||
if (standalone) {
|
||||
|
|
@ -442,14 +479,23 @@ void MetalBackend::onCopyDeviceToDevice(const Tensor *src, const Tensor *dst,
|
|||
}
|
||||
|
||||
void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst) const {
|
||||
onCopyBuffer(src, dst, nil);
|
||||
flushEncoder();
|
||||
onCopyBuffer(src, dst, nil, nil);
|
||||
}
|
||||
|
||||
void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst, id<MTLComputeCommandEncoder> encoder) const {
|
||||
id<MTLComputeCommandEncoder> MetalBackend::encoder() const {
|
||||
if (nil == mComputeEncoder) {
|
||||
auto ctx = (__bridge MNNMetalContext *)context();
|
||||
mComputeEncoder = [ctx encoder];
|
||||
}
|
||||
return mComputeEncoder;
|
||||
}
|
||||
|
||||
void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst, id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> shape) const {
|
||||
MNN_ASSERT(src->buffer().dimensions == dst->buffer().dimensions);
|
||||
|
||||
if (!src->buffer().host && !dst->buffer().host) {
|
||||
onCopyDeviceToDevice(src, dst, encoder);
|
||||
onCopyDeviceToDevice(src, dst, encoder, shape);
|
||||
} else if (!src->buffer().host && dst->buffer().host) {
|
||||
onCopyDeviceToHost(src, dst);
|
||||
} else if (src->buffer().host && !dst->buffer().host) {
|
||||
|
|
|
|||
|
|
@ -20,9 +20,12 @@ public:
|
|||
MetalBinary(Backend *backend, std::string type);
|
||||
virtual ~MetalBinary() = default;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
private:
|
||||
std::string mKernelName;
|
||||
id<MTLBuffer> mConstBuffer;
|
||||
id<MTLComputePipelineState> mPipeline;
|
||||
std::pair<MTLSize, MTLSize> mThreads;
|
||||
};
|
||||
|
||||
} // namespace MNN
|
||||
|
|
|
|||
|
|
@ -16,8 +16,6 @@ struct binary_op_shape {
|
|||
int i1stride;
|
||||
int output_data_count;
|
||||
int output_width;
|
||||
int output_size;
|
||||
int output_dimensions;
|
||||
};
|
||||
#define define_op(op) \
|
||||
kernel void binary_##op##_x1(const device ftype *in0 [[buffer(0)]],\
|
||||
|
|
|
|||
|
|
@ -15,33 +15,39 @@
|
|||
namespace MNN {
|
||||
|
||||
MetalBinary::MetalBinary(Backend *backend, std::string type) : Execution(backend) {
|
||||
mKernelName = "binary_" + type + "_x1";
|
||||
auto mKernelName = "binary_" + type + "_x1";
|
||||
auto mtbn = static_cast<MetalBackend *>(backend);
|
||||
auto context = (__bridge MNNMetalContext *)mtbn->context();
|
||||
mConstBuffer = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
|
||||
auto kn = [NSString stringWithCString:mKernelName.c_str() encoding:[NSString defaultCStringEncoding]];
|
||||
mPipeline = [context pipelineWithName:kn];
|
||||
}
|
||||
|
||||
ErrorCode MetalBinary::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
ErrorCode MetalBinary::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
auto input0 = inputs[0], input1 = inputs[1], output = outputs[0];
|
||||
const int input0_data_count = (int)input0->elementSize();
|
||||
const int input1_data_count = (int)input1->elementSize();
|
||||
auto shape = [context newDeviceBuffer:6 * sizeof(int) access:CPUWriteOnly];
|
||||
auto encoder = [context encoder];
|
||||
|
||||
int outdatacount = output->elementSize();
|
||||
((int *)shape.contents)[0] = input0_data_count == 1 ? 0 : 1;
|
||||
((int *)shape.contents)[1] = input1_data_count == 1 ? 0 : 1;
|
||||
((int *)shape.contents)[2] = outdatacount;
|
||||
auto kn = [NSString stringWithCString:mKernelName.c_str() encoding:[NSString defaultCStringEncoding]];
|
||||
((int *)mConstBuffer.contents)[0] = input0_data_count == 1 ? 0 : 1;
|
||||
((int *)mConstBuffer.contents)[1] = input1_data_count == 1 ? 0 : 1;
|
||||
((int *)mConstBuffer.contents)[2] = outdatacount;
|
||||
((int *)mConstBuffer.contents)[3] = 0;
|
||||
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(outdatacount, 1, 1)];
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
auto bandwidth = [context load:kn encoder:encoder];
|
||||
ErrorCode MetalBinary::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto input0 = inputs[0], input1 = inputs[1], output = outputs[0];
|
||||
auto encoder = backend->encoder();
|
||||
[encoder setComputePipelineState:mPipeline];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input0->deviceId() offset:0 atIndex:0];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input1->deviceId() offset:0 atIndex:1];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:2];
|
||||
[encoder setBuffer:shape offset:0 atIndex:3];
|
||||
[context dispatchEncoder:encoder threads:{ (NSUInteger) outdatacount, 1, 1 } bandwidth:bandwidth];
|
||||
|
||||
[encoder endEncoding];
|
||||
MNN_PRINT_ENCODER(context, encoder);
|
||||
[encoder setBuffer:mConstBuffer offset:0 atIndex:3];
|
||||
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -37,15 +37,13 @@ ErrorCode MetalCast::onExecute(const std::vector<Tensor *> &inputs, const std::v
|
|||
return NOT_SUPPORT;
|
||||
}
|
||||
|
||||
auto encoder = [context encoder];
|
||||
auto encoder = backend->encoder();
|
||||
auto bandwidth = [context load:kernel encoder:encoder];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
|
||||
[context dispatchEncoder:encoder
|
||||
threads:{ (NSUInteger) output->elementSize(), (NSUInteger)1, (NSUInteger)1 }
|
||||
bandwidth:bandwidth];
|
||||
[encoder endEncoding];
|
||||
MNN_PRINT_ENCODER(context, encoder);
|
||||
return NO_ERROR;
|
||||
}
|
||||
static DataType _mapDataType(DataType src) {
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ protected:
|
|||
virtual ErrorCode onFloat(const Tensor *input, const Tensor *output) override;
|
||||
|
||||
private:
|
||||
const MNN::Op *mOp = nullptr;
|
||||
int mThreadgroupMemory = 0;
|
||||
bool mLocalPreferred = false;
|
||||
bool isThreadgroupLocalPreferred(const Tensor *input, const Tensor *output);
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@
|
|||
namespace MNN {
|
||||
|
||||
MetalConvolution::MetalConvolution(Backend *backend, const MNN::Op *op) : MetalConvolutionCommon(backend, op) {
|
||||
mOp = op;
|
||||
loadWeight(op->main_as_Convolution2D());
|
||||
}
|
||||
|
||||
|
|
@ -52,17 +53,9 @@ ErrorCode MetalConvolution::onResize(const std::vector<Tensor *> &inputs, const
|
|||
auto input = inputs[0], output = outputs[0];
|
||||
auto iw = input->width(), ih = input->height(), igz = UP_DIV(input->channel(), 4) / mGroups;
|
||||
auto ow = output->width(), oh = output->height(), ogz = UP_DIV(output->channel(), 4) / mGroups;
|
||||
|
||||
// pad mode support
|
||||
int padX = mPadX, padY = mPadY;
|
||||
if (mPadMode == PadMode_SAME) {
|
||||
int kernelWidthSize = (mKernelX - 1) * mDilateX + 1;
|
||||
int kernelHeightSize = (mKernelY - 1) * mDilateY + 1;
|
||||
int pw = (ow - 1) * mStrideX + kernelWidthSize - iw;
|
||||
int ph = (oh - 1) * mStrideY + kernelHeightSize - ih;
|
||||
padX = pw / 2;
|
||||
padY = ph / 2;
|
||||
}
|
||||
auto pads = ConvolutionCommon::convolutionPad(input, output, mOp->main_as_Convolution2D()->common());
|
||||
auto padX = pads.first;
|
||||
auto padY = pads.second;
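
For reference, a sketch of the SAME-padding arithmetic that the convolutionPad() helper replaces here, mirroring the removed lines: the kernel span is widened by dilation and the total padding is split evenly between the two sides.

// Illustrative only: compute SAME padding for a 2-D convolution.
struct PadXY { int x; int y; };

PadXY samePad(int iw, int ih, int ow, int oh,
              int kx, int ky, int sx, int sy, int dx, int dy) {
    int kw = (kx - 1) * dx + 1;          // dilated kernel width
    int kh = (ky - 1) * dy + 1;          // dilated kernel height
    int pw = (ow - 1) * sx + kw - iw;    // total horizontal padding
    int ph = (oh - 1) * sy + kh - ih;    // total vertical padding
    return { pw / 2, ph / 2 };
}
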
// update threadgroup memory if needed
|
||||
int stepSlices = igz;
|
||||
|
|
@ -112,7 +105,7 @@ ErrorCode MetalConvolution::onFloat(const Tensor *input, const Tensor *output) {
|
|||
auto ib = iw * ih * iz * 4 * unit, ig = ib / mGroups;
|
||||
auto ob = ow * oh * oz * 4 * sizeof(metal_float), og = ob / mGroups;
|
||||
|
||||
auto encoder = [context encoder];
|
||||
auto encoder = backend->encoder();
|
||||
auto bandwidth = (MetalBandwidth){};
|
||||
MTLSize threads = {};
|
||||
if (mLocalPreferred) {
|
||||
|
|
@ -144,7 +137,6 @@ ErrorCode MetalConvolution::onFloat(const Tensor *input, const Tensor *output) {
|
|||
}
|
||||
}
|
||||
}
|
||||
[encoder endEncoding];
|
||||
MNN_PRINT_ENCODER(context, encoder);
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
|
@ -152,6 +144,12 @@ ErrorCode MetalConvolution::onFloat(const Tensor *input, const Tensor *output) {
|
|||
class MetalConvolutionCreator : public MetalBackend::Creator {
|
||||
public:
|
||||
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend) const {
|
||||
auto param = op->main_as_Convolution2D();
|
||||
if (param->quanParameter() != nullptr) {
|
||||
if (param->quanParameter()->has_scaleInt()) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
if (op->type() == OpType_Convolution) {
|
||||
auto conv = op->main_as_Convolution2D();
|
||||
auto input = inputs[0];
|
||||
|
|
|
|||
|
|
@ -23,6 +23,9 @@ public:
|
|||
|
||||
protected:
|
||||
virtual ErrorCode onFloat(const Tensor *input, const Tensor *output) override;
|
||||
private:
|
||||
id<MTLComputePipelineState> mPipeline;
|
||||
std::pair<MTLSize, MTLSize> mThreads;
|
||||
};
|
||||
|
||||
} // namespace MNN
|
||||
|
|
|
|||
|
|
@ -51,27 +51,73 @@ kernel void conv1x1_g1z4(const device ftype4 *in [[buffer(0)]],
|
|||
const device ftype4x4 *wt [[buffer(3)]],
|
||||
const device ftype4 *biasTerms [[buffer(4)]],
|
||||
uint3 gid [[thread_position_in_grid]]) {
|
||||
if ((int)gid.x >= cst.output_size || (int)gid.y * CONV_UNROLL >= cst.output_slice || (int)gid.z >= cst.batch) return;
|
||||
if ((int)gid.x * CONV_UNROLL >= cst.output_size || (int)gid.y >= cst.output_slice || (int)gid.z >= cst.batch) return;
|
||||
|
||||
int uz = gid.y * CONV_UNROLL;
|
||||
auto xy_wt0 = wt + uz * cst.input_slice;
|
||||
auto xy_wt1 = uz + 1 < cst.output_slice ? xy_wt0 + cst.input_slice : nullptr;
|
||||
auto xy_wt2 = uz + 2 < cst.output_slice ? xy_wt1 + cst.input_slice : nullptr;
|
||||
auto xy_wt3 = uz + 3 < cst.output_slice ? xy_wt2 + cst.input_slice : nullptr;
|
||||
auto xy_in = in + (int)gid.z * cst.input_slice * cst.input_size + (int)gid.x;
|
||||
auto xy_out = out + (int)gid.z * cst.output_slice * cst.output_size + uz * cst.output_size + (int)gid.x;
|
||||
|
||||
float4 result0 = 0, result1 = 0, result2 = 0, result3 = 0;
|
||||
for (auto z = 0; z < cst.input_slice; z++, xy_in += cst.input_size) {
|
||||
auto in4 = *xy_in;
|
||||
/* true */ result0 += float4(in4 * xy_wt0[z]);
|
||||
if (xy_wt1) result1 += float4(in4 * xy_wt1[z]);
|
||||
if (xy_wt2) result2 += float4(in4 * xy_wt2[z]);
|
||||
if (xy_wt3) result3 += float4(in4 * xy_wt3[z]);
|
||||
int rx = gid.x * CONV_UNROLL;
|
||||
int uz = gid.y;
|
||||
auto xy_wt = wt + uz * cst.input_slice;
|
||||
auto xy_in0 = in + (int)gid.z * cst.input_slice * cst.input_size + rx + 0;
|
||||
auto xy_in1 = in + (int)gid.z * cst.input_slice * cst.input_size + rx + 1;
|
||||
auto xy_in2 = in + (int)gid.z * cst.input_slice * cst.input_size + rx + 2;
|
||||
auto xy_in3 = in + (int)gid.z * cst.input_slice * cst.input_size + rx + 3;
|
||||
auto xy_out = out + (int)gid.z * cst.output_slice * cst.output_size + uz * cst.output_size + rx;
|
||||
auto biasValue = float4(biasTerms[uz]);
|
||||
float4 result0 = biasValue, result1 = biasValue, result2 = biasValue, result3 = biasValue;
|
||||
int computeSize = min(cst.output_size - rx, CONV_UNROLL);
|
||||
if (computeSize == CONV_UNROLL) {
|
||||
for (auto z = 0; z < cst.input_slice; z++) {
|
||||
auto in40 = *xy_in0;
|
||||
auto in41 = *xy_in1;
|
||||
auto in42 = *xy_in2;
|
||||
auto in43 = *xy_in3;
|
||||
auto w = xy_wt[z];
|
||||
|
||||
result0 += float4(in40 * w);
|
||||
result1 += float4(in41 * w);
|
||||
result2 += float4(in42 * w);
|
||||
result3 += float4(in43 * w);
|
||||
xy_in0 += cst.input_size;
|
||||
xy_in1 += cst.input_size;
|
||||
xy_in2 += cst.input_size;
|
||||
xy_in3 += cst.input_size;
|
||||
}
|
||||
} else if (computeSize == 3) {
|
||||
for (auto z = 0; z < cst.input_slice; z++) {
|
||||
auto in40 = *xy_in0;
|
||||
auto in41 = *xy_in1;
|
||||
auto in42 = *xy_in2;
|
||||
auto w = xy_wt[z];
|
||||
|
||||
result0 += float4(in40 * w);
|
||||
result1 += float4(in41 * w);
|
||||
result2 += float4(in42 * w);
|
||||
xy_in0 += cst.input_size;
|
||||
xy_in1 += cst.input_size;
|
||||
xy_in2 += cst.input_size;
|
||||
}
|
||||
} else if (computeSize == 2) {
|
||||
for (auto z = 0; z < cst.input_slice; z++) {
|
||||
auto in40 = *xy_in0;
|
||||
auto in41 = *xy_in1;
|
||||
auto w = xy_wt[z];
|
||||
|
||||
result0 += float4(in40 * w);
|
||||
result1 += float4(in41 * w);
|
||||
xy_in0 += cst.input_size;
|
||||
xy_in1 += cst.input_size;
|
||||
}
|
||||
} else {
|
||||
for (auto z = 0; z < cst.input_slice; z++) {
|
||||
auto in40 = *xy_in0;
|
||||
auto w = xy_wt[z];
|
||||
|
||||
result0 += float4(in40 * w);
|
||||
xy_in0 += cst.input_size;
|
||||
}
|
||||
}
|
||||
|
||||
/* true */ *xy_out = activate(ftype4(result0 + float4(biasTerms[uz + 0])), cst.activation);
|
||||
if (xy_wt1) { xy_out += cst.output_size; *xy_out = activate(ftype4(result1 + float4(biasTerms[uz + 1])), cst.activation); }
|
||||
if (xy_wt2) { xy_out += cst.output_size; *xy_out = activate(ftype4(result2 + float4(biasTerms[uz + 2])), cst.activation); }
|
||||
if (xy_wt3) { xy_out += cst.output_size; *xy_out = activate(ftype4(result3 + float4(biasTerms[uz + 3])), cst.activation); }
|
||||
/* true */ *xy_out = activate(ftype4(result0), cst.activation);
|
||||
if (computeSize > 1) {xy_out[1] = activate(ftype4(result1), cst.activation); }
|
||||
if (computeSize > 2) {xy_out[2] = activate(ftype4(result2), cst.activation); }
|
||||
if (computeSize > 3) {xy_out[3] = activate(ftype4(result3), cst.activation); }
|
||||
}
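
A scalar sketch of what conv1x1_g1z4 computes at each output position: a 1x1 convolution is a pure channel-mixing product, out[oc] = bias[oc] + sum over ic of w[oc][ic] * in[ic], applied independently per pixel; the Metal kernel additionally handles CONV_UNROLL (4) consecutive pixels per thread and works on channel groups of 4. Names and layout below are illustrative.

#include <algorithm>

// Illustrative scalar 1x1 convolution; "in" and "out" are channel-major planes.
void conv1x1(const float* in, const float* w, const float* bias, float* out,
             int ic, int oc, int pixels, bool relu) {
    for (int p = 0; p < pixels; ++p) {
        for (int o = 0; o < oc; ++o) {
            float acc = bias[o];
            for (int i = 0; i < ic; ++i) {
                acc += w[o * ic + i] * in[i * pixels + p];
            }
            out[o * pixels + p] = relu ? std::max(acc, 0.0f) : acc;
        }
    }
}
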
@ -33,38 +33,35 @@ ErrorCode MetalConvolution1x1::onResize(const std::vector<Tensor *> &inputs, con
|
|||
auto input = inputs[0], output = outputs[0];
|
||||
auto is = input->width() * input->height(), iz = UP_DIV(input->channel(), 4), igz = iz / mGroups;
|
||||
auto os = output->width() * output->height(), oz = UP_DIV(output->channel(), 4), ogz = oz / mGroups;
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
|
||||
// create const buffer
|
||||
int constants[] = {is, igz, iz, os, ogz, oz, output->batch(), mActivationType};
|
||||
mConstBuffer.reset(sizeof(constants));
|
||||
::memcpy(mConstBuffer.buffer().contents, constants, sizeof(constants));
|
||||
auto w = output->width(), h = output->height(), z = UP_DIV(output->channel(), 4), b = output->batch();
|
||||
if (mGroups == 1 && (w * h >= 32)) {
|
||||
mPipeline = [context pipelineWithName:@"conv1x1_g1z4"];
|
||||
mThreads = [context computeBestGroupAndLocal:mPipeline threads:{(NSUInteger)UP_DIV(w * h, 4), (NSUInteger)z, (NSUInteger)b}];
|
||||
} else {
|
||||
mPipeline = [context pipelineWithName:@"conv1x1"];
|
||||
mThreads = [context computeBestGroupAndLocal:mPipeline threads:{(NSUInteger)w * h, (NSUInteger)z, (NSUInteger)b}];
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode MetalConvolution1x1::onFloat(const Tensor *input, const Tensor *output) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
auto w = output->width(), h = output->height(), z = UP_DIV(output->channel(), 4), b = output->batch();
|
||||
|
||||
auto encoder = [context encoder];
|
||||
auto bandwidth = (MetalBandwidth){};
|
||||
MTLSize threads = {};
|
||||
if (mGroups == 1 && (w * h * b >= 32 ? z >= 16 : z >= 128)) {
|
||||
bandwidth = [context load:@"conv1x1_g1z4" encoder:encoder];
|
||||
threads = {(NSUInteger)w * h, (NSUInteger)UP_DIV(z, 4), (NSUInteger)b};
|
||||
} else {
|
||||
bandwidth = [context load:@"conv1x1" encoder:encoder];
|
||||
threads = {(NSUInteger)w * h, (NSUInteger)z, (NSUInteger)b};
|
||||
}
|
||||
bandwidth.zAxisProtected = YES;
|
||||
auto encoder = backend->encoder();
|
||||
[encoder setComputePipelineState:mPipeline];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
|
||||
[encoder setBuffer:mConstBuffer.buffer() offset:0 atIndex:2];
|
||||
[encoder setBuffer:mWeight offset:0 atIndex:3];
|
||||
[encoder setBuffer:mBias offset:0 atIndex:4];
|
||||
[context dispatchEncoder:encoder threads:threads bandwidth:bandwidth];
|
||||
[encoder endEncoding];
|
||||
MNN_PRINT_ENCODER(context, encoder);
|
||||
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
|
||||
return NO_ERROR;
|
||||
}
|
||||
} // namespace MNN
|
||||
|
|
|
|||
|
|
@ -23,6 +23,9 @@ public:
|
|||
protected:
|
||||
virtual ErrorCode onFloat(const Tensor *input, const Tensor *output) override;
|
||||
virtual id<MTLBuffer> weightForFloat(int group, int oc, int ic, int kh, int kw, const float *src) override;
|
||||
private:
|
||||
id<MTLComputePipelineState> mPipeline;
|
||||
std::pair<MTLSize, MTLSize> mThreads;
|
||||
};
|
||||
|
||||
} // namespace MNN
|
||||
|
|
|
|||
|
|
@ -60,24 +60,24 @@ ErrorCode MetalConvolutionDepthwise::onResize(const std::vector<Tensor *> &input
|
|||
mActivationType};
|
||||
mConstBuffer.reset(sizeof(constants));
|
||||
::memcpy(mConstBuffer.buffer().contents, constants, sizeof(constants));
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
mPipeline = [context pipelineWithName:@"conv_depthwise"];
|
||||
auto w = output->width(), h = output->height(), z = UP_DIV(output->channel(), 4), b = output->batch();
|
||||
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(w, h, z*b)];
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode MetalConvolutionDepthwise::onFloat(const Tensor *input, const Tensor *output) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
auto w = output->width(), h = output->height(), z = UP_DIV(output->channel(), 4), b = output->batch();
|
||||
|
||||
auto encoder = [context encoder];
|
||||
auto bandwidth = [context load:@"conv_depthwise" encoder:encoder];
|
||||
auto encoder = backend->encoder();
|
||||
[encoder setComputePipelineState:mPipeline];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
|
||||
[encoder setBuffer:mConstBuffer.buffer() offset:0 atIndex:2];
|
||||
[encoder setBuffer:mWeight offset:0 atIndex:3];
|
||||
[encoder setBuffer:mBias offset:0 atIndex:4];
|
||||
[context dispatchEncoder:encoder threads:{ (NSUInteger)w, (NSUInteger)h, (NSUInteger)z * b } bandwidth:bandwidth];
|
||||
[encoder endEncoding];
|
||||
MNN_PRINT_ENCODER(context, encoder);
|
||||
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
|
|
@ -112,6 +112,9 @@ id<MTLBuffer> MetalConvolutionDepthwise::weightForFloat(int group, int oc, int i
|
|||
class MetalConvolutionDepthwiseCreator : public MetalBackend::Creator {
|
||||
public:
|
||||
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend) const {
|
||||
if (inputs.size() > 1) {
|
||||
return nullptr;
|
||||
}
|
||||
return new MetalConvolutionDepthwise(backend, op);
|
||||
}
|
||||
};
|
||||
|
|
|
|||
|
|
@ -30,6 +30,12 @@ private:
|
|||
id<MTLBuffer> mShapeBuffer = nil;
|
||||
std::shared_ptr<Tensor> mTempInput;
|
||||
std::shared_ptr<Tensor> mTempOutput;
|
||||
id<MTLComputePipelineState> mPipelineGEMM;
|
||||
std::pair<MTLSize, MTLSize> mGemm;
|
||||
id<MTLComputePipelineState> mPipelineIm2Col;
|
||||
std::pair<MTLSize, MTLSize> mIm2Col;
|
||||
id<MTLComputePipelineState> mPipelineCol2Im;
|
||||
std::pair<MTLSize, MTLSize> mCol2Im;
|
||||
};
|
||||
|
||||
} // namespace MNN
|
||||
|
|
|
|||
|
|
@ -21,7 +21,6 @@ bool MetalConvolutionGEMM::isValid(const Convolution2D *conv, const Tensor *inpu
|
|||
if (kx == 1 || ky == 1 || common->group() != 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto oc = common->outputCount();
|
||||
if (oc <= 64) {
|
||||
return false;
|
||||
|
|
@ -106,6 +105,14 @@ ErrorCode MetalConvolutionGEMM::onResize(const std::vector<Tensor *> &inputs, co
|
|||
}
|
||||
backend->onReleaseBuffer(mTempInput.get(), Backend::DYNAMIC);
|
||||
backend->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
|
||||
mPipelineGEMM = [context pipelineWithName:@"matmul4x4"];
|
||||
mPipelineIm2Col = [context pipelineWithName:@"conv_im2col"];
|
||||
mPipelineCol2Im = [context pipelineWithName:@"conv_col2im"];
|
||||
NSUInteger gw = UP_DIV(output->width() * output->height() * output->batch(), 4);
|
||||
NSUInteger gh = UP_DIV(output->channel(), 4);
|
||||
mGemm = [context computeBestGroupAndLocal:mPipelineGEMM threads:{gw, gh, 1}];
|
||||
mIm2Col = [context computeBestGroupAndLocal:mPipelineIm2Col threads:{(NSUInteger)ow, (NSUInteger)oh, (NSUInteger)iz*ob}];
|
||||
mCol2Im = [context computeBestGroupAndLocal:mPipelineCol2Im threads:{(NSUInteger)ow, (NSUInteger)oh, (NSUInteger)oz*ob}];
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
|
|
@ -115,42 +122,31 @@ ErrorCode MetalConvolutionGEMM::onExecute(const std::vector<Tensor *> &inputs, c
|
|||
|
||||
ErrorCode MetalConvolutionGEMM::onFloat(const Tensor *input, const Tensor *output) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
auto encoder = [context encoder];
|
||||
auto encoder = backend->encoder();
|
||||
|
||||
{ // im2col
|
||||
NSUInteger iz = UP_DIV(input->channel(), 4), ib = input->batch();
|
||||
NSUInteger ow = output->width(), oh = output->height();
|
||||
|
||||
auto bandwidth = [context load:@"conv_im2col" encoder:encoder];
|
||||
[encoder setComputePipelineState:mPipelineIm2Col];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)mTempInput->deviceId() offset:0 atIndex:1];
|
||||
[encoder setBuffer:mConstBuffer.buffer() offset:0 atIndex:2];
|
||||
[context dispatchEncoder:encoder threads:{ ow, oh, iz *ib } bandwidth:bandwidth];
|
||||
[encoder dispatchThreadgroups:mIm2Col.first threadsPerThreadgroup:mIm2Col.second];
|
||||
}
|
||||
{ // gemm
|
||||
NSUInteger gw = UP_DIV(output->width() * output->height() * output->batch(), 4);
|
||||
NSUInteger gh = UP_DIV(output->channel(), 4);
|
||||
|
||||
auto bandwidth = [context load:@"matmul4x4" encoder:encoder];
|
||||
[encoder setComputePipelineState:mPipelineGEMM];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)mTempInput->deviceId() offset:0 atIndex:0];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)mTempOutput->deviceId() offset:0 atIndex:1];
|
||||
[encoder setBuffer:mWeight offset:0 atIndex:2];
|
||||
[encoder setBuffer:mShapeBuffer offset:0 atIndex:3];
|
||||
[context dispatchEncoder:encoder threads:{ gw, gh, 1 } bandwidth:bandwidth];
|
||||
[encoder dispatchThreadgroups:mGemm.first threadsPerThreadgroup:mGemm.second];
|
||||
}
|
||||
{ // col2im
|
||||
NSUInteger ow = output->width(), oh = output->height(), oz = UP_DIV(output->channel(), 4), ob = output->batch();
|
||||
|
||||
auto bandwidth = [context load:@"conv_col2im" encoder:encoder];
|
||||
[encoder setComputePipelineState:mPipelineCol2Im];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)mTempOutput->deviceId() offset:0 atIndex:0];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
|
||||
[encoder setBuffer:mBias offset:0 atIndex:2];
|
||||
[encoder setBuffer:mConstBuffer.buffer() offset:0 atIndex:3];
|
||||
[context dispatchEncoder:encoder threads:{ ow, oh, oz *ob } bandwidth:bandwidth];
|
||||
[encoder dispatchThreadgroups:mCol2Im.first threadsPerThreadgroup:mCol2Im.second];
|
||||
}
|
||||
[encoder endEncoding];
|
||||
MNN_PRINT_ENCODER(context, encoder);
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -131,7 +131,7 @@ ErrorCode MetalConvolutionWinograd::onResize(const std::vector<Tensor *> &inputs
|
|||
ErrorCode MetalConvolutionWinograd::onFloat(const Tensor *input, const Tensor *output) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
auto encoder = [context encoder];
|
||||
auto encoder = backend->encoder();
|
||||
{ // transform
|
||||
auto bandwidth = [context load:mKernelX == 3 ? @"winograd_transform_source2_3_1" : @"winograd_transform_source2_5_1" encoder:encoder];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
|
||||
|
|
@ -155,7 +155,6 @@ ErrorCode MetalConvolutionWinograd::onFloat(const Tensor *input, const Tensor *o
|
|||
[encoder setBuffer:mConstBuffer.buffer() offset:0 atIndex:3];
|
||||
[context dispatchEncoder:encoder threads:mOutputTransformThreads bandwidth:bandwidth];
|
||||
}
|
||||
[encoder endEncoding];
|
||||
MNN_PRINT_ENCODER(context, encoder);
|
||||
|
||||
return NO_ERROR;
|
||||
|
|
|
|||
|
|
@ -39,10 +39,9 @@ private:
|
|||
id<MTLBuffer> mWeight = nil;
|
||||
id<MTLBuffer> mBias = nil;
|
||||
id<MTLBuffer> mConstBuffer = nil;
|
||||
id<MTLComputePipelineState> mPipeline;
|
||||
std::pair<MTLSize, MTLSize> mThreads;
|
||||
|
||||
private:
|
||||
ErrorCode onDepthwise(const Tensor *input, const Tensor *output);
|
||||
ErrorCode onDeconv(const Tensor *input, const Tensor *output);
|
||||
};
|
||||
|
||||
} // namespace MNN
|
||||
|
|
|
|||
|
|
@ -36,6 +36,7 @@ struct deconv_constants {
|
|||
int delta_iy;
|
||||
int delta_ix;
|
||||
int has_bias;
|
||||
int batch;
|
||||
};
|
||||
|
||||
kernel void deconv(const device ftype4 *in [[buffer(0)]],
|
||||
|
|
@ -44,7 +45,7 @@ kernel void deconv(const device ftype4 *in [[buffer(0)]],
|
|||
const device ftype4x4 *wt [[buffer(3)]],
|
||||
const device ftype4 *biasTerms [[buffer(4)]],
|
||||
uint3 gid [[thread_position_in_grid]]) {
|
||||
if ((int)gid.x >= cst.output_width || (int)gid.y >= cst.output_height) return;
|
||||
if ((int)gid.x >= cst.output_width || (int)gid.y >= cst.output_height || (int)gid.z >= cst.batch * cst.output_slice) return;
|
||||
|
||||
short b = gid.z / cst.output_slice;
|
||||
short o = gid.z % cst.output_slice;
|
||||
|
|
@ -86,9 +87,9 @@ kernel void deconv_depthwise(const device ftype4 *in [[buffer(0)]],
|
|||
const device ftype4 *wt [[buffer(3)]],
|
||||
const device ftype4 *biasTerms [[buffer(4)]],
|
||||
ushort3 gid [[thread_position_in_grid]]) {
|
||||
if ((int)gid.x >= cst.output_width || (int)gid.y >= cst.output_height) return;
|
||||
if ((int)gid.x >= cst.output_width || (int)gid.y >= cst.output_height || (int)gid.z >= cst.batch * cst.output_slice) return;
|
||||
|
||||
float4 result = float4(biasTerms[(short)gid.z]);
|
||||
float4 result = float4(biasTerms[(short)(gid.z % cst.input_slice)]);
|
||||
|
||||
short oy = (short)gid.y + cst.pad_y;
|
||||
short ox = (short)gid.x + cst.pad_x;
|
||||
|
|
|
|||
|
|
@@ -140,6 +140,11 @@ MetalDeconvolution::MetalDeconvolution(Backend *backend, const MNN::Op *op) : Ex
    }
    mWeight = weightForDeconv(context, mDepthwise, deconv, qnt.get());
    mBias = biasForDeconv(context, deconv);
    if (mDepthwise) {
        mPipeline = [context pipelineWithName:@"deconv_depthwise"];
    } else {
        mPipeline = [context pipelineWithName:@"deconv"];
    }
}

ErrorCode MetalDeconvolution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
@@ -148,6 +153,7 @@ ErrorCode MetalDeconvolution::onResize(const std::vector<Tensor *> &inputs, cons
    auto input = inputs[0], output = outputs[0];
    int iw = input->width(), ih = input->height(), iz = UP_DIV(input->channel(), 4);
    int ow = output->width(), oh = output->height(), oz = UP_DIV(output->channel(), 4);
    int ob = output->batch();

    // pad mode support
    int padX = mPadX, padY = mPadY;
@@ -185,63 +191,26 @@ ErrorCode MetalDeconvolution::onResize(const std::vector<Tensor *> &inputs, cons
        deltaKy * mDilateY / mStrideY,
        deltaKx * mDilateX / mStrideX,
        mBias.length > 0,
        ob
    };
    mConstBuffer = [context newDeviceBuffer:sizeof(consts) bytes:consts access:CPUWriteOnly];
    return NO_ERROR;
}

ErrorCode MetalDeconvolution::onDepthwise(const Tensor *input, const Tensor *output) {
    auto backend = static_cast<MetalBackend *>(this->backend());
    auto context = (__bridge MNNMetalContext *)backend->context();
    int unit = sizeof(metal_float);
    auto iw = input->width(), ih = input->height(), iz = UP_DIV(input->channel(), 4), ib = iw * ih * iz * 4 * unit;
    auto ow = output->width(), oh = output->height(), oz = UP_DIV(output->channel(), 4), ob = ow * oh * oz * 4 * unit;

    // run
    auto encoder = [context encoder];
    auto bandwidth = [context load:@"deconv_depthwise" encoder:encoder];
    for (int b = 0; b < input->batch(); b++) {
        [encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:b * ib atIndex:0];
        [encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:b * ob atIndex:1];
        [encoder setBuffer:mConstBuffer offset:0 atIndex:2];
        [encoder setBuffer:mWeight offset:0 atIndex:3];
        [encoder setBuffer:mBias offset:0 atIndex:4];
        [context dispatchEncoder:encoder
                          threads:{ (NSUInteger) ow, (NSUInteger)oh, (NSUInteger)oz }
                        bandwidth:bandwidth];
    }
    [encoder endEncoding];
    MNN_PRINT_ENCODER(context, encoder);
    return NO_ERROR;
}
ErrorCode MetalDeconvolution::onDeconv(const Tensor *input, const Tensor *output) {
    auto backend = static_cast<MetalBackend *>(this->backend());
    auto context = (__bridge MNNMetalContext *)backend->context();
    int ow = output->width(), oh = output->height(), oc = output->channel(), oz = UP_DIV(oc, 4), ob = output->batch();

    // run
    auto encoder = [context encoder];
    auto bandwidth = [context load:@"deconv" encoder:encoder];
    [encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
    [encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
    [encoder setBuffer:mConstBuffer offset:0 atIndex:2];
    [encoder setBuffer:mWeight offset:0 atIndex:3];
    [encoder setBuffer:mBias offset:0 atIndex:4];
    [context dispatchEncoder:encoder
                      threads:{ (NSUInteger) ow, (NSUInteger)oh, (NSUInteger)oz * ob }
                    bandwidth:bandwidth];
    [encoder endEncoding];
    MNN_PRINT_ENCODER(context, encoder);
    mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake((NSUInteger) ow, (NSUInteger)oh, (NSUInteger)oz * ob)];
    return NO_ERROR;
}

ErrorCode MetalDeconvolution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto input = inputs[0], output = outputs[0];
    if (mDepthwise) {
        return onDepthwise(input, output);
    } else {
        return onDeconv(input, output);
    }
    auto backend = static_cast<MetalBackend *>(this->backend());
    // run
    auto encoder = backend->encoder();
    [encoder setComputePipelineState:mPipeline];
    [encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
    [encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
    [encoder setBuffer:mConstBuffer offset:0 atIndex:2];
    [encoder setBuffer:mWeight offset:0 atIndex:3];
    [encoder setBuffer:mBias offset:0 atIndex:4];
    [encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
    return NO_ERROR;
}

class MetalDeconvolutionCreator : public MetalBackend::Creator {

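The deconvolution hunks above fold the batch dimension into the Z axis of the compute grid: both kernels now bail out when gid.z >= cst.batch * cst.output_slice, and the full-size kernel recovers the batch index and channel slice with a divide and a modulo, so one dispatch covers every batch instead of looping per batch on the CPU side. A minimal host-side C++ sketch of that index math follows; it is illustrative only, and batchCount/outputSlice are stand-in names for the fields of deconv_constants, not MNN APIs.

// Sketch (assumption-labelled): how a flattened Z index splits back into
// (batch, channel-slice) the way the updated deconv kernels do.
#include <cassert>
#include <cstdio>

int main() {
    const int batchCount  = 3;                         // cst.batch
    const int outputSlice = 5;                         // cst.output_slice = UP_DIV(channels, 4)
    const int gridZ       = batchCount * outputSlice;  // threads launched along Z
    for (int gz = 0; gz < gridZ; ++gz) {
        int b = gz / outputSlice;                      // batch handled by this thread
        int o = gz % outputSlice;                      // 4-channel slice handled by this thread
        assert(b < batchCount && o < outputSlice);
        std::printf("gid.z=%2d -> batch=%d slice=%d\n", gz, b, o);
    }
    return 0;
}

The depthwise kernel indexes its bias with gid.z % cst.input_slice for the same reason: once batch is folded into Z, the per-channel bias repeats identically across batches.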
@@ -1,39 +0,0 @@
//
// MetalDequantize.hpp
// MNN
//
// Created by MNN on 2019/01/30.
// Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef MetalDequantize_hpp
#define MetalDequantize_hpp

#import "core/Execution.hpp"
#import "MNN_generated.h"
#import "MetalDefine.h"

#if MNN_METAL_ENABLED
namespace MNN {

class MetalDequantize : public Execution {
public:
    MetalDequantize(Backend *backend, const Dequantize *dq);
    virtual ~MetalDequantize() = default;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

private:
    ErrorCode onTFLite(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
    ErrorCode onMinCombined(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
    ErrorCode onMinFirst(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
    ErrorCode onScaled(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);

private:
    ModeFormat mModeFormat;
    DataType mType;
    QuantizeMode mMode;
};
} // namespace MNN

#endif /* MNN_METAL_ENABLED */
#endif /* MetalDequantize_hpp */

@ -1,186 +0,0 @@
|
|||
//
|
||||
// MetalDequantize.metal
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2018/11/13.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#include <metal_stdlib>
|
||||
#include "MetalDefine.metal"
|
||||
|
||||
using namespace metal;
|
||||
|
||||
kernel void dequantize_min_combined_uint8(const device uchar *input0 [[buffer(0)]],
|
||||
const device ftype *input1 [[buffer(1)]],
|
||||
const device ftype *input2 [[buffer(2)]],
|
||||
device ftype *output [[buffer(3)]],
|
||||
uint gid [[thread_position_in_grid]]) {
|
||||
float min_range = input1[0];
|
||||
float max_range = input2[0];
|
||||
float factor = (max_range - min_range) / UINT8_MAX;
|
||||
output[int(gid)] = input0[int(gid)] * factor + min_range;
|
||||
}
|
||||
kernel void dequantize_min_combined_uint16(const device ushort *input0 [[buffer(0)]],
|
||||
const device ftype *input1 [[buffer(1)]],
|
||||
const device ftype *input2 [[buffer(2)]],
|
||||
device ftype *output [[buffer(3)]],
|
||||
uint gid [[thread_position_in_grid]]) {
|
||||
float min_range = input1[0];
|
||||
float max_range = input2[0];
|
||||
float factor = (max_range - min_range) / UINT16_MAX;
|
||||
output[int(gid)] = input0[int(gid)] * factor + min_range;
|
||||
}
|
||||
kernel void dequantize_min_combined_int8(const device char *input0 [[buffer(0)]],
|
||||
const device ftype *input1 [[buffer(1)]],
|
||||
const device ftype *input2 [[buffer(2)]],
|
||||
device ftype *output [[buffer(3)]],
|
||||
uint gid [[thread_position_in_grid]]) {
|
||||
float min_range = input1[0];
|
||||
float max_range = input2[0];
|
||||
float half_range = (INT8_MAX - INT8_MIN + 1) / 2.f;
|
||||
float factor = (max_range - min_range) / (INT8_MAX - INT8_MIN);
|
||||
output[int(gid)] = (input0[int(gid)] + half_range) * factor + min_range;
|
||||
}
|
||||
kernel void dequantize_min_combined_int16(const device short *input0 [[buffer(0)]],
|
||||
const device ftype *input1 [[buffer(1)]],
|
||||
const device ftype *input2 [[buffer(2)]],
|
||||
device ftype *output [[buffer(3)]],
|
||||
uint gid [[thread_position_in_grid]]) {
|
||||
float min_range = input1[0];
|
||||
float max_range = input2[0];
|
||||
float half_range = (INT16_MAX - INT16_MIN + 1) / 2.f;
|
||||
float factor = (max_range - min_range) / (INT16_MAX - INT16_MIN);
|
||||
output[int(gid)] = (input0[int(gid)] + half_range) * factor + min_range;
|
||||
}
|
||||
kernel void dequantize_min_combined_int32(const device int *input0 [[buffer(0)]],
|
||||
const device ftype *input1 [[buffer(1)]],
|
||||
const device ftype *input2 [[buffer(2)]],
|
||||
device ftype *output [[buffer(3)]],
|
||||
uint gid [[thread_position_in_grid]]) {
|
||||
float min_range = input1[0];
|
||||
float max_range = input2[0];
|
||||
float half_range = (INT32_MAX - INT32_MIN + 1) / 2.f;
|
||||
float factor = (max_range - min_range) / (INT32_MAX - INT32_MIN);
|
||||
output[int(gid)] = (input0[int(gid)] + half_range) * factor + min_range;
|
||||
}
|
||||
|
||||
|
||||
|
||||
kernel void dequantize_min_first_uint8(const device uchar *input0 [[buffer(0)]],
|
||||
const device ftype *input1 [[buffer(1)]],
|
||||
const device ftype *input2 [[buffer(2)]],
|
||||
device ftype *output [[buffer(3)]],
|
||||
uint gid [[thread_position_in_grid]]) {
|
||||
float min_range = input1[0];
|
||||
float max_range = input2[0];
|
||||
float mask = float(uchar(~0));
|
||||
float range_scale = (max_range - min_range) / mask;
|
||||
float range_min_rounded = max_range == min_range ? min_range : round(min_range / range_scale) * range_scale;
|
||||
output[int(gid)] = input0[int(gid)] * range_scale + range_min_rounded;
|
||||
}
|
||||
kernel void dequantize_min_first_uint16(const device ushort *input0 [[buffer(0)]],
|
||||
const device ftype *input1 [[buffer(1)]],
|
||||
const device ftype *input2 [[buffer(2)]],
|
||||
device ftype *output [[buffer(3)]],
|
||||
uint gid [[thread_position_in_grid]]) {
|
||||
float min_range = input1[0];
|
||||
float max_range = input2[0];
|
||||
float mask = float(ushort(~0));
|
||||
float range_scale = (max_range - min_range) / mask;
|
||||
float range_min_rounded = max_range == min_range ? min_range : round(min_range / range_scale) * range_scale;
|
||||
output[int(gid)] = input0[int(gid)] * range_scale + range_min_rounded;
|
||||
}
|
||||
kernel void dequantize_min_first_int8(const device char *input0 [[buffer(0)]],
|
||||
const device ftype *input1 [[buffer(1)]],
|
||||
const device ftype *input2 [[buffer(2)]],
|
||||
device ftype *output [[buffer(3)]],
|
||||
uint gid [[thread_position_in_grid]]) {
|
||||
float min_range = input1[0];
|
||||
float max_range = input2[0];
|
||||
float mask = float(uchar(~0));
|
||||
float range_scale = (max_range - min_range) / mask;
|
||||
float range_min_rounded = max_range == min_range ? min_range : round(min_range / range_scale) * range_scale;
|
||||
float lowest_quantized = float(INT8_MIN);
|
||||
float result_add = range_min_rounded - lowest_quantized * range_scale;
|
||||
output[int(gid)] = input0[int(gid)] * range_scale + result_add;
|
||||
}
|
||||
kernel void dequantize_min_first_int16(const device short *input0 [[buffer(0)]],
|
||||
const device ftype *input1 [[buffer(1)]],
|
||||
const device ftype *input2 [[buffer(2)]],
|
||||
device ftype *output [[buffer(3)]],
|
||||
uint gid [[thread_position_in_grid]]) {
|
||||
float min_range = input1[0];
|
||||
float max_range = input2[0];
|
||||
float mask = float(ushort(~0));
|
||||
float range_scale = (max_range - min_range) / mask;
|
||||
float range_min_rounded = max_range == min_range ? min_range : round(min_range / range_scale) * range_scale;
|
||||
float lowest_quantized = float(INT16_MIN);
|
||||
float result_add = range_min_rounded - lowest_quantized * range_scale;
|
||||
output[int(gid)] = input0[int(gid)] * range_scale + result_add;
|
||||
}
|
||||
kernel void dequantize_min_first_int32(const device int *input0 [[buffer(0)]],
|
||||
const device ftype *input1 [[buffer(1)]],
|
||||
const device ftype *input2 [[buffer(2)]],
|
||||
device ftype *output [[buffer(3)]],
|
||||
uint gid [[thread_position_in_grid]]) {
|
||||
float min_range = input1[0];
|
||||
float max_range = input2[0];
|
||||
float mask = float(uint(~0));
|
||||
float range_scale = (max_range - min_range) / mask;
|
||||
float range_min_rounded = max_range == min_range ? min_range : round(min_range / range_scale) * range_scale;
|
||||
float lowest_quantized = float(INT32_MIN);
|
||||
float result_add = range_min_rounded - lowest_quantized * range_scale;
|
||||
output[int(gid)] = input0[int(gid)] * range_scale + result_add;
|
||||
}
|
||||
|
||||
|
||||
|
||||
kernel void dequantize_scaled_uint8(const device uchar *input0 [[buffer(0)]],
|
||||
const device ftype *input1 [[buffer(1)]],
|
||||
const device ftype *input2 [[buffer(2)]],
|
||||
device ftype *output [[buffer(3)]],
|
||||
uint gid [[thread_position_in_grid]]) {
|
||||
float max_range = input2[0];
|
||||
float factor = max_range / UINT8_MAX;
|
||||
output[int(gid)] = input0[int(gid)] * factor;
|
||||
}
|
||||
kernel void dequantize_scaled_uint16(const device ushort *input0 [[buffer(0)]],
|
||||
const device ftype *input1 [[buffer(1)]],
|
||||
const device ftype *input2 [[buffer(2)]],
|
||||
device ftype *output [[buffer(3)]],
|
||||
uint gid [[thread_position_in_grid]]) {
|
||||
float max_range = input2[0];
|
||||
float factor = max_range / UINT16_MAX;
|
||||
output[int(gid)] = input0[int(gid)] * factor;
|
||||
}
|
||||
kernel void dequantize_scaled_int8(const device char *input0 [[buffer(0)]],
|
||||
const device ftype *input1 [[buffer(1)]],
|
||||
const device ftype *input2 [[buffer(2)]],
|
||||
device ftype *output [[buffer(3)]],
|
||||
uint gid [[thread_position_in_grid]]) {
|
||||
float min_range = input1[0];
|
||||
float max_range = input2[0];
|
||||
float factor = max(min_range / INT8_MIN, max_range / INT8_MAX);
|
||||
output[int(gid)] = input0[int(gid)] * factor;
|
||||
}
|
||||
kernel void dequantize_scaled_int16(const device short *input0 [[buffer(0)]],
|
||||
const device ftype *input1 [[buffer(1)]],
|
||||
const device ftype *input2 [[buffer(2)]],
|
||||
device ftype *output [[buffer(3)]],
|
||||
uint gid [[thread_position_in_grid]]) {
|
||||
float min_range = input1[0];
|
||||
float max_range = input2[0];
|
||||
float factor = max(min_range / INT16_MIN, max_range / INT16_MAX);
|
||||
output[int(gid)] = input0[int(gid)] * factor;
|
||||
}
|
||||
kernel void dequantize_scaled_int32(const device int *input0 [[buffer(0)]],
|
||||
const device ftype *input1 [[buffer(1)]],
|
||||
const device ftype *input2 [[buffer(2)]],
|
||||
device ftype *output [[buffer(3)]],
|
||||
uint gid [[thread_position_in_grid]]) {
|
||||
float min_range = input1[0];
|
||||
float max_range = input2[0];
|
||||
float factor = max(min_range / INT32_MIN, max_range / INT32_MAX);
|
||||
output[int(gid)] = input0[int(gid)] * factor;
|
||||
}
|
||||
|
|
@ -1,151 +0,0 @@
|
|||
//
|
||||
// MetalDequantize.mm
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2019/01/30.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#import "backend/metal/MetalDequantize.hpp"
|
||||
#import "backend/metal/MNNMetalContext.h"
|
||||
#import "core/Macro.h"
|
||||
#import "backend/metal/MetalBackend.hpp"
|
||||
|
||||
#if MNN_METAL_ENABLED
|
||||
namespace MNN {
|
||||
|
||||
MetalDequantize::MetalDequantize(Backend *backend, const Dequantize *dq)
|
||||
: Execution(backend), mModeFormat(dq->modelFormat()), mType(dq->type()), mMode(dq->mode()) {
|
||||
// nothing to do
|
||||
}
|
||||
|
||||
ErrorCode MetalDequantize::onMinCombined(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
auto &input0 = inputs[0], &input1 = inputs[1], &input2 = inputs[2], &output0 = outputs[0];
|
||||
auto encoder = [context encoder];
|
||||
auto bandwidth = (MetalBandwidth){};
|
||||
switch (mType) {
|
||||
case DataType_DT_QUINT8:
|
||||
bandwidth = [context load:@"dequantize_min_combined_uint8" encoder:encoder];
|
||||
break;
|
||||
case DataType_DT_QUINT16:
|
||||
bandwidth = [context load:@"dequantize_min_combined_uint16" encoder:encoder];
|
||||
break;
|
||||
case DataType_DT_QINT8:
|
||||
bandwidth = [context load:@"dequantize_min_combined_int8" encoder:encoder];
|
||||
break;
|
||||
case DataType_DT_QINT16:
|
||||
bandwidth = [context load:@"dequantize_min_combined_int16" encoder:encoder];
|
||||
break;
|
||||
case DataType_DT_QINT32:
|
||||
bandwidth = [context load:@"dequantize_min_combined_int32" encoder:encoder];
|
||||
break;
|
||||
default:
|
||||
MNN_ASSERT(false); // unsupported type
|
||||
return NOT_SUPPORT;
|
||||
}
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input0->deviceId() offset:0 atIndex:0];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input1->deviceId() offset:0 atIndex:1];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input2->deviceId() offset:0 atIndex:2];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output0->deviceId() offset:0 atIndex:3];
|
||||
[context dispatchEncoder:encoder threads:{ (NSUInteger) output0->elementSize(), 1, 1 } bandwidth:bandwidth];
|
||||
[encoder endEncoding];
|
||||
MNN_PRINT_ENCODER(context, encoder);
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode MetalDequantize::onMinFirst(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
auto &input0 = inputs[0], &input1 = inputs[1], &input2 = inputs[2], &output0 = outputs[0];
|
||||
auto encoder = [context encoder];
|
||||
auto bandwidth = (MetalBandwidth){};
|
||||
switch (mType) {
|
||||
case DataType_DT_QUINT8:
|
||||
bandwidth = [context load:@"dequantize_min_first_uint8" encoder:encoder];
|
||||
break;
|
||||
case DataType_DT_QUINT16:
|
||||
bandwidth = [context load:@"dequantize_min_first_uint16" encoder:encoder];
|
||||
break;
|
||||
case DataType_DT_QINT8:
|
||||
bandwidth = [context load:@"dequantize_min_first_int8" encoder:encoder];
|
||||
break;
|
||||
case DataType_DT_QINT16:
|
||||
bandwidth = [context load:@"dequantize_min_first_int16" encoder:encoder];
|
||||
break;
|
||||
case DataType_DT_QINT32:
|
||||
bandwidth = [context load:@"dequantize_min_first_int32" encoder:encoder];
|
||||
break;
|
||||
default:
|
||||
MNN_ASSERT(false); // unsupported type
|
||||
return NOT_SUPPORT;
|
||||
}
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input0->deviceId() offset:0 atIndex:0];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input1->deviceId() offset:0 atIndex:1];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input2->deviceId() offset:0 atIndex:2];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output0->deviceId() offset:0 atIndex:3];
|
||||
[context dispatchEncoder:encoder threads:{ (NSUInteger) output0->elementSize(), 1, 1 } bandwidth:bandwidth];
|
||||
[encoder endEncoding];
|
||||
MNN_PRINT_ENCODER(context, encoder);
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode MetalDequantize::onScaled(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
auto &input0 = inputs[0], &input1 = inputs[1], &input2 = inputs[2], &output0 = outputs[0];
|
||||
auto encoder = [context encoder];
|
||||
auto bandwidth = (MetalBandwidth){};
|
||||
switch (mType) {
|
||||
case DataType_DT_QUINT8:
|
||||
bandwidth = [context load:@"dequantize_scaled_uint8" encoder:encoder];
|
||||
break;
|
||||
case DataType_DT_QUINT16:
|
||||
bandwidth = [context load:@"dequantize_scaled_uint16" encoder:encoder];
|
||||
break;
|
||||
case DataType_DT_QINT8:
|
||||
bandwidth = [context load:@"dequantize_scaled_int8" encoder:encoder];
|
||||
break;
|
||||
case DataType_DT_QINT16:
|
||||
bandwidth = [context load:@"dequantize_scaled_int16" encoder:encoder];
|
||||
break;
|
||||
case DataType_DT_QINT32:
|
||||
bandwidth = [context load:@"dequantize_scaled_int32" encoder:encoder];
|
||||
break;
|
||||
default:
|
||||
MNN_ASSERT(false); // unsupported type
|
||||
return NOT_SUPPORT;
|
||||
}
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input0->deviceId() offset:0 atIndex:0];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input1->deviceId() offset:0 atIndex:1];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input2->deviceId() offset:0 atIndex:2];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output0->deviceId() offset:0 atIndex:3];
|
||||
[context dispatchEncoder:encoder threads:{ (NSUInteger) output0->elementSize(), 1, 1 } bandwidth:bandwidth];
|
||||
[encoder endEncoding];
|
||||
MNN_PRINT_ENCODER(context, encoder);
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode MetalDequantize::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
switch (mMode) {
|
||||
case QuantizeMode_MIN_COMBINED:
|
||||
return onMinCombined(inputs, outputs);
|
||||
case QuantizeMode_MIN_FIRST:
|
||||
return onMinFirst(inputs, outputs);
|
||||
case QuantizeMode_SCALED:
|
||||
return onScaled(inputs, outputs);
|
||||
default:
|
||||
return NOT_SUPPORT;
|
||||
}
|
||||
}
|
||||
|
||||
class MetalDequantizeCreator : public MetalBackend::Creator {
|
||||
public:
|
||||
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend) const {
|
||||
return new MetalDequantize(backend, op->main_as_Dequantize());
|
||||
}
|
||||
};
|
||||
REGISTER_METAL_OP_CREATOR(MetalDequantizeCreator, OpType_Dequantize);
|
||||
} // namespace MNN
|
||||
#endif /* MNN_METAL_ENABLED */
|
||||
|
|
@@ -21,10 +21,13 @@ public:
    MetalEltwise(Backend *backend, EltwiseType type);
    virtual ~MetalEltwise() = default;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

private:
    EltwiseType mType;
    void encode(NSString *kernel, const Tensor *input0, const Tensor *input1, const Tensor *output);
    void encode(const Tensor *input0, const Tensor *input1, const Tensor *output);
    id<MTLComputePipelineState> mPipeline;
    id<MTLBuffer> mConst;
    std::pair<MTLSize, MTLSize> mThreads;
};

} // namespace MNN

@@ -11,23 +11,33 @@

using namespace metal;

kernel void eltwise_prod(device const ftype4 *in0 [[buffer(0)]],
                         device const ftype4 *in1 [[buffer(1)]],
                         device ftype4 *out [[buffer(2)]],

kernel void eltwise_prod(device const ftype *in0 [[buffer(0)]],
                         device const ftype *in1 [[buffer(1)]],
                         device ftype *out [[buffer(2)]],
                         constant int4& shape [[buffer(3)]],
                         uint gid [[thread_position_in_grid]]) {
    out[(int)gid] = in0[(int)gid] * in1[(int)gid];
    if ((int)gid < shape.x) {
        out[(int)gid] = in0[(int)gid] * in1[(int)gid];
    }
}

kernel void eltwise_max(device const ftype4 *in0 [[buffer(0)]],
                        device const ftype4 *in1 [[buffer(1)]],
                        device ftype4 *out [[buffer(2)]],
kernel void eltwise_max(device const ftype *in0 [[buffer(0)]],
                        device const ftype *in1 [[buffer(1)]],
                        device ftype *out [[buffer(2)]],
                        constant int4& shape [[buffer(3)]],
                        uint gid [[thread_position_in_grid]]) {
    out[(int)gid] = max(in0[(int)gid], in1[(int)gid]);
    if ((int)gid < shape.x) {
        out[(int)gid] = max(in0[(int)gid], in1[(int)gid]);
    }
}

kernel void eltwise_add(device const ftype4 *in0 [[buffer(0)]],
                        device const ftype4 *in1 [[buffer(1)]],
                        device ftype4 *out [[buffer(2)]],
kernel void eltwise_add(device const ftype *in0 [[buffer(0)]],
                        device const ftype *in1 [[buffer(1)]],
                        device ftype *out [[buffer(2)]],
                        constant int4& shape [[buffer(3)]],
                        uint gid [[thread_position_in_grid]]) {
    out[(int)gid] = in0[(int)gid] + in1[(int)gid];
    if ((int)gid < shape.x) {
        out[(int)gid] = in0[(int)gid] + in1[(int)gid];
    }
}

@@ -14,26 +14,12 @@
#if MNN_METAL_ENABLED
namespace MNN {

MetalEltwise::MetalEltwise(Backend *backend, EltwiseType type) : Execution(backend), mType(type) {
    // nothing to do
}

void MetalEltwise::encode(NSString *kernel, const Tensor *input0, const Tensor *input1, const Tensor *output) {
    auto metal = static_cast<MetalBackend *>(this->backend());
MetalEltwise::MetalEltwise(Backend *backend, EltwiseType type) : Execution(backend) {
    auto metal = static_cast<MetalBackend *>(backend);
    auto context = (__bridge MNNMetalContext *)metal->context();
    auto encoder = [context encoder];
    auto bandwidth = [context load:kernel encoder:encoder];
    [encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input0->deviceId() offset:0 atIndex:0];
    [encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input1->deviceId() offset:0 atIndex:1];
    [encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:2];
    [context dispatchEncoder:encoder threads:{ (NSUInteger)output->elementSize() / 4, 1, 1 } bandwidth:bandwidth];
    [encoder endEncoding];
    MNN_PRINT_ENCODER(context, encoder);
}

ErrorCode MetalEltwise::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    mConst = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
    NSString *kernel = nil;
    switch (mType) {
    switch (type) {
        case EltwiseType_PROD:
            kernel = @"eltwise_prod";
            break;
@@ -46,11 +32,32 @@ ErrorCode MetalEltwise::onExecute(const std::vector<Tensor *> &inputs, const std
        default:
            break;
    }
    mPipeline = [context pipelineWithName:kernel];
}
ErrorCode MetalEltwise::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    ((int*)(mConst.contents))[0] = outputs[0]->elementSize();
    auto metal = static_cast<MetalBackend *>(this->backend());
    auto context = (__bridge MNNMetalContext *)metal->context();
    mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(outputs[0]->elementSize(), 1, 1)];
    return NO_ERROR;
}

void MetalEltwise::encode(const Tensor *input0, const Tensor *input1, const Tensor *output) {
    auto metal = static_cast<MetalBackend *>(this->backend());
    auto encoder = metal->encoder();
    [encoder setComputePipelineState:mPipeline];
    [encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input0->deviceId() offset:0 atIndex:0];
    [encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input1->deviceId() offset:0 atIndex:1];
    [encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:2];
    [encoder setBuffer:mConst offset:0 atIndex:3];
    [encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
}

ErrorCode MetalEltwise::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto output = outputs[0];
    encode(kernel, inputs[0], inputs[1], output);
    encode(inputs[0], inputs[1], output);
    for (int i = 2; i < inputs.size(); i++) {
        encode(kernel, inputs[i], output, output);
        encode(inputs[i], output, output);
    }
    return NO_ERROR;
}

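The eltwise rewrite above sizes the dispatch in onResize with computeBestGroupAndLocal over outputs[0]->elementSize() and stores that element count in mConst; because the launch is rounded up to whole threadgroups, the scalar kernels need the if ((int)gid < shape.x) guard to mask the trailing threads. A short C++ sketch of that rounding follows; the numbers are invented for illustration and this is not the MNN scheduling code.

// Sketch (assumption-labelled): ceil-division threadgroup sizing and the
// number of surplus threads the in-kernel bounds check has to mask.
#include <cstdio>

int main() {
    const int elementSize = 1000;  // outputs[0]->elementSize(), written into mConst
    const int local       = 64;    // hypothetical threads-per-threadgroup choice
    const int groups      = (elementSize + local - 1) / local; // ceil division
    const int launched    = groups * local;
    std::printf("%d threadgroups x %d threads = %d launched for %d elements (%d masked)\n",
                groups, local, launched, elementSize, launched - elementSize);
    return 0;
}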
@ -20,11 +20,14 @@ public:
|
|||
MetalInterp(Backend *backend, const Op* op);
|
||||
virtual ~MetalInterp() = default;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
private:
|
||||
int32_t mReiszeType;
|
||||
id<MTLBuffer> mCordTransform;
|
||||
id<MTLBuffer> mShape;
|
||||
id<MTLComputePipelineState> mPipeline;
|
||||
std::pair<MTLSize, MTLSize> mThreads;
|
||||
};
|
||||
|
||||
} // namespace MNN
|
||||
|
|
|
|||
|
|
@ -28,8 +28,7 @@ MetalInterp::MetalInterp(Backend *backend, const Op* op)
|
|||
mReiszeType = interpParam->resizeType();
|
||||
mShape = [context newDeviceBuffer:7 * sizeof(int) access:CPUWriteOnly];
|
||||
}
|
||||
|
||||
ErrorCode MetalInterp::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
ErrorCode MetalInterp::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
auto input = inputs[0], output = outputs[0];
|
||||
|
|
@ -43,38 +42,44 @@ ErrorCode MetalInterp::onExecute(const std::vector<Tensor *> &inputs, const std:
|
|||
((int *)mShape.contents)[4] = oh;
|
||||
((int *)mShape.contents)[5] = ow * oh;
|
||||
((int *)mShape.contents)[6] = slice;
|
||||
|
||||
|
||||
// encode
|
||||
auto encoder = [context encoder];
|
||||
auto bandwidth = (MetalBandwidth){};
|
||||
if (mReiszeType == 2 || mReiszeType == 1) {
|
||||
if (2 == mReiszeType) {
|
||||
bandwidth = [context load:@"resize_bilinear" encoder:encoder];
|
||||
mPipeline = [context pipelineWithName:@"resize_bilinear"];
|
||||
} else {
|
||||
bandwidth = [context load:@"resize_nearest" encoder:encoder];
|
||||
mPipeline = [context pipelineWithName:@"resize_nearest"];
|
||||
}
|
||||
} else if (mReiszeType == 3) {
|
||||
bandwidth = [context load:@"resize_cubic" encoder:encoder];
|
||||
mPipeline = [context pipelineWithName:@"resize_cubic"];
|
||||
} else {
|
||||
MNN_ASSERT(false);
|
||||
}
|
||||
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(ow, oh, slice)];
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
|
||||
ErrorCode MetalInterp::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto input = inputs[0], output = outputs[0];
|
||||
// encode
|
||||
auto encoder = backend->encoder();
|
||||
[encoder setComputePipelineState:mPipeline];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
|
||||
[encoder setBuffer:mShape offset:0 atIndex:2];
|
||||
[encoder setBuffer:mCordTransform offset:0 atIndex:3];
|
||||
[context dispatchEncoder:encoder
|
||||
threads:{ (NSUInteger) ow, (NSUInteger)oh, (NSUInteger)slice }
|
||||
bandwidth:bandwidth];
|
||||
[encoder endEncoding];
|
||||
MNN_PRINT_ENCODER(context, encoder);
|
||||
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
class MetalInterpCreator : public MetalBackend::Creator {
|
||||
public:
|
||||
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend) const {
|
||||
auto interpParam = op->main_as_Interp();
|
||||
auto type = interpParam->resizeType();
|
||||
if (type > 3) {
|
||||
return nullptr;
|
||||
}
|
||||
return new MetalInterp(backend, op);
|
||||
}
|
||||
};
|
||||
|
|
|
|||
|
|
@ -67,7 +67,7 @@ ErrorCode MetalMatMul::onExecute(const std::vector<Tensor *> &inputs, const std:
|
|||
auto e = C->length(0);
|
||||
auto h = C->length(1);
|
||||
|
||||
auto encoder = [context encoder];
|
||||
auto encoder = backend->encoder();
|
||||
if (inputs.size() > 2) {
|
||||
auto bandwidth = [context load:@"matmul_bias" encoder:encoder];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input0->deviceId() offset:0 atIndex:0];
|
||||
|
|
@ -78,7 +78,6 @@ ErrorCode MetalMatMul::onExecute(const std::vector<Tensor *> &inputs, const std:
|
|||
[context dispatchEncoder:encoder
|
||||
threads:{ (NSUInteger)h, (NSUInteger)e, (NSUInteger)1 }
|
||||
bandwidth:bandwidth];
|
||||
[encoder endEncoding];
|
||||
} else {
|
||||
auto bandwidth = [context load:@"matmul" encoder:encoder];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input0->deviceId() offset:0 atIndex:0];
|
||||
|
|
@ -88,7 +87,6 @@ ErrorCode MetalMatMul::onExecute(const std::vector<Tensor *> &inputs, const std:
|
|||
[context dispatchEncoder:encoder
|
||||
threads:{ (NSUInteger)h, (NSUInteger)e, (NSUInteger)1 }
|
||||
bandwidth:bandwidth];
|
||||
[encoder endEncoding];
|
||||
}
|
||||
MNN_PRINT_ENCODER(context, encoder);
|
||||
return NO_ERROR;
|
||||
|
|
|
|||
|
|
@@ -13,7 +13,6 @@
extern void ___MetalReLUCreator__OpType_ReLU__();
extern void ___MetalPoolingCreator__OpType_Pooling__();
extern void ___MetalScaleCreator__OpType_Scale__();
extern void ___MetalDequantizeCreator__OpType_Dequantize__();
extern void ___MetalInterpCreator__OpType_Interp__();
extern void ___MetalUnaryCreator__OpType_UnaryOp__();
extern void ___MetalRasterCreator__OpType_Raster__();

@@ -33,7 +32,6 @@ void registerMetalOps() {
    ___MetalReLUCreator__OpType_ReLU__();
    ___MetalPoolingCreator__OpType_Pooling__();
    ___MetalScaleCreator__OpType_Scale__();
    ___MetalDequantizeCreator__OpType_Dequantize__();
    ___MetalInterpCreator__OpType_Interp__();
    ___MetalUnaryCreator__OpType_UnaryOp__();
    ___MetalRasterCreator__OpType_Raster__();

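The registry hunks above drop the Dequantize creator to match the deleted MetalDequantize sources: each ___Metal*Creator__OpType_*__ symbol is emitted by the registration macro seen elsewhere in this patch (REGISTER_METAL_OP_CREATOR), and registerMetalOps() calls them so the backend's creator table gains one entry per op. A generic, self-contained C++ sketch of that table-of-creators pattern follows; the types and names below are simplified stand-ins, not the actual MNN machinery.

// Sketch (assumption-labelled): a minimal creator registry in the spirit of
// the op registration calls above.
#include <cstdio>
#include <functional>
#include <map>
#include <string>

struct Execution {};                               // stand-in for MNN::Execution
using Creator = std::function<Execution*()>;

static std::map<std::string, Creator>& registry() {
    static std::map<std::string, Creator> r;       // lazily constructed table
    return r;
}

static void registerCreator(const std::string& opType, Creator c) {
    registry()[opType] = std::move(c);             // one entry per op type
}

int main() {
    // registerMetalOps() plays this role in the diff: call one registration
    // function per op; removing the Dequantize call removes its table entry.
    registerCreator("Pooling", [] { return new Execution(); });
    registerCreator("Scale",   [] { return new Execution(); });
    std::printf("registered %zu creators\n", registry().size());
    return 0;
}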
@ -20,9 +20,13 @@ public:
|
|||
MetalPReLU(Backend *backend, const float *slope, int count);
|
||||
virtual ~MetalPReLU() = default;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
private:
|
||||
id<MTLBuffer> mSlope;
|
||||
id<MTLBuffer> mShape;
|
||||
id<MTLComputePipelineState> mPipeline;
|
||||
std::pair<MTLSize, MTLSize> mThreads;
|
||||
bool mShareChannel = false;
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -18,35 +18,41 @@ MetalPReLU::MetalPReLU(Backend *backend, const float *slope, int count) : Execut
|
|||
auto context = (__bridge MNNMetalContext *)static_cast<MetalBackend *>(backend)->context();
|
||||
mSlope = [context newDeviceBuffer:UP_DIV(count, 4) * 4 * sizeof(float) bytes:slope access:CPUWriteOnly];
|
||||
mShareChannel = 1 == count;
|
||||
if (!mShareChannel) {
|
||||
mShape = [context newDeviceBuffer:3 * sizeof(int) access:CPUWriteOnly];
|
||||
}
|
||||
mPipeline = [context pipelineWithName:mShareChannel ? @"prelu" : @"prelu_slopes"];
|
||||
}
|
||||
|
||||
ErrorCode MetalPReLU::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
auto output = outputs[0];
|
||||
int w = output->width(), h = output->height(), z = UP_DIV(output->channel(), 4), b = output->batch();
|
||||
if (mShareChannel) {
|
||||
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(w * h * z * b, 1, 1)];
|
||||
} else {
|
||||
((int *)mShape.contents)[0] = w * h;
|
||||
((int *)mShape.contents)[1] = z;
|
||||
((int *)mShape.contents)[2] = b;
|
||||
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(w * h, z, b)];
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode MetalPReLU::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
auto input = inputs[0], output = outputs[0];
|
||||
int w = output->width(), h = output->height(), z = UP_DIV(output->channel(), 4), b = output->batch();
|
||||
|
||||
auto encoder = [context encoder];
|
||||
auto bandwidth = [context load:mShareChannel ? @"prelu" : @"prelu_slopes" encoder:encoder];
|
||||
auto encoder = backend->encoder();
|
||||
[encoder setComputePipelineState:mPipeline];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
|
||||
[encoder setBuffer:mSlope offset:0 atIndex:2];
|
||||
if (mShareChannel) {
|
||||
[context dispatchEncoder:encoder
|
||||
threads:{ (NSUInteger) w * h * z * b, (NSUInteger)1, (NSUInteger)1 }
|
||||
bandwidth:bandwidth];
|
||||
} else {
|
||||
auto shape = [context newDeviceBuffer:3 * sizeof(int) access:CPUWriteOnly];
|
||||
((int *)shape.contents)[0] = w * h;
|
||||
((int *)shape.contents)[1] = z;
|
||||
((int *)shape.contents)[2] = b;
|
||||
[encoder setBuffer:shape offset:0 atIndex:3];
|
||||
[context dispatchEncoder:encoder
|
||||
threads:{ (NSUInteger) w * h, (NSUInteger)z, (NSUInteger)b }
|
||||
bandwidth:bandwidth];
|
||||
if (!mShareChannel) {
|
||||
[encoder setBuffer:mShape offset:0 atIndex:3];
|
||||
}
|
||||
[encoder endEncoding];
|
||||
MNN_PRINT_ENCODER(context, encoder);
|
||||
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -33,6 +33,9 @@ private:
|
|||
int mPadX;
|
||||
int mPadY;
|
||||
id<MTLBuffer> mConstBuffer;
|
||||
MTLSize mGroup;
|
||||
MTLSize mLocal;
|
||||
id<MTLComputePipelineState> mPipeline;
|
||||
};
|
||||
|
||||
} // namespace MNN
|
||||
|
|
|
|||
|
|
@ -59,27 +59,23 @@ ErrorCode MetalPooling::onResize(const std::vector<Tensor *> &inputs, const std:
|
|||
((int *)mConstBuffer.contents)[8] = strideHeight;
|
||||
((int *)mConstBuffer.contents)[9] = padWidth;
|
||||
((int *)mConstBuffer.contents)[10] = padHeight;
|
||||
|
||||
auto ow = output->width(), oh = output->height(), slice = UP_DIV(output->channel(), 4) * output->batch();
|
||||
mPipeline = [context pipelineWithName:(mPoolType == PoolType_MAXPOOL) ? @"pooling_max" : @"pooling_avg"];
|
||||
auto size = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(ow, oh, slice)];
|
||||
mLocal = size.second;
|
||||
mGroup = size.first;
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode MetalPooling::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
auto input = inputs[0], output = outputs[0];
|
||||
auto ow = output->width(), oh = output->height(), slice = UP_DIV(output->channel(), 4) * output->batch();
|
||||
|
||||
auto encoder = [context encoder];
|
||||
auto bandwidth = [context load:(mPoolType == PoolType_MAXPOOL) ? @"pooling_max" : @"pooling_avg" encoder:encoder];
|
||||
bandwidth.zAxisProtected = YES;
|
||||
auto encoder = backend->encoder();
|
||||
[encoder setComputePipelineState:mPipeline];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
|
||||
[encoder setBuffer:mConstBuffer offset:0 atIndex:2];
|
||||
[context dispatchEncoder:encoder
|
||||
threads:{ (NSUInteger) ow, (NSUInteger)oh, (NSUInteger)slice }
|
||||
bandwidth:bandwidth];
|
||||
[encoder endEncoding];
|
||||
MNN_PRINT_ENCODER(context, encoder);
|
||||
[encoder dispatchThreadgroups:mGroup threadsPerThreadgroup:mLocal];
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ ErrorCode MetalROIPooling::onExecute(const std::vector<Tensor *> &inputs, const
|
|||
((int *)shape.contents)[6] = oz;
|
||||
((float *)shape.contents)[7] = mSpatialScale;
|
||||
|
||||
auto encoder = [context encoder];
|
||||
auto encoder = backend->encoder();
|
||||
auto bandwidth = [context load:@"ROI_pooling" encoder:encoder];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
|
||||
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)roi->deviceId() offset:0 atIndex:1];
|
||||
|
|
@ -45,7 +45,6 @@ ErrorCode MetalROIPooling::onExecute(const std::vector<Tensor *> &inputs, const
|
|||
[context dispatchEncoder:encoder
|
||||
threads:{ (NSUInteger) ow, (NSUInteger)oh, (NSUInteger)oz *ob }
|
||||
bandwidth:bandwidth];
|
||||
[encoder endEncoding];
|
||||
MNN_PRINT_ENCODER(context, encoder);
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -24,10 +24,13 @@ public:
|
|||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
private:
|
||||
std::map<Tensor*, std::shared_ptr<Tensor>> mTempInput;
|
||||
std::vector<std::tuple<id<MTLBuffer>, id<MTLBuffer>, std::vector<int> > > mTempInputCopy;
|
||||
std::vector<std::tuple<id<MTLBuffer>, id<MTLBuffer>, MTLSize, MTLSize> > mTempInputCopy;
|
||||
std::shared_ptr<Tensor> mTempOutput;
|
||||
bool mNeedZero = false;
|
||||
id<MTLBuffer> mOutputPtr;
|
||||
bool mFast = false;
|
||||
id<MTLComputePipelineState> mBlitPipeline;
|
||||
std::vector<id<MTLBuffer>> mShapeTemp;
|
||||
};
|
||||
|
||||
} // namespace MNN
|
||||
|
|
|
|||
|
|
@ -6,12 +6,14 @@
|
|||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#if MNN_METAL_ENABLED
|
||||
#import "backend/metal/MetalRaster.hpp"
|
||||
#import "backend/metal/MNNMetalContext.h"
|
||||
#import "core/Macro.h"
|
||||
#import "backend/metal/MetalBackend.hpp"
|
||||
#include "core/TensorUtils.hpp"
|
||||
#if MNN_METAL_ENABLED
|
||||
#include "core/OpCommonUtils.hpp"
|
||||
|
||||
namespace MNN {
|
||||
|
||||
struct SamplerInfo {
|
||||
|
|
@ -44,10 +46,67 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &inputs, const std::
|
|||
auto des = TensorUtils::getDescribe(input);
|
||||
auto outputDes = TensorUtils::getDescribe(output);
|
||||
mNeedZero = !TensorUtils::regionIsFull(input);
|
||||
auto context = (__bridge MNNMetalContext *)static_cast<MetalBackend *>(backend())->context();
|
||||
auto bytes = outputs[0]->getType().bytes();
|
||||
|
||||
mTempInput.clear();
|
||||
mTempOutput = nullptr;
|
||||
mOutputPtr = (__bridge id<MTLBuffer>)((void*)output->deviceId());
|
||||
#ifndef MNN_METAL_FORBID_RASTER_C4
|
||||
if (outputDes->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
|
||||
bool fast = true;
|
||||
for (int i=0; i< des->regions.size(); ++i) {
|
||||
auto& slice = des->regions[i];
|
||||
if (TensorUtils::getDescribe(slice.origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
|
||||
fast = false;
|
||||
break;
|
||||
}
|
||||
if (!OpCommonUtils::canBlitFast(slice, output)) {
|
||||
fast = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
mFast = fast;
|
||||
if (fast) {
|
||||
NSString* kernelName = nil;
|
||||
switch (bytes) {
|
||||
case 4:
|
||||
kernelName = @"blit_int32x4";
|
||||
break;
|
||||
case 2:
|
||||
kernelName = @"blit_int64";
|
||||
break;
|
||||
case 1:
|
||||
kernelName = @"blit_int32";
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
if (outputs[0]->getType().code == halide_type_float) {
|
||||
#if MNN_METAL_FULL_PRECISION
|
||||
kernelName = @"blit_int32x4";
|
||||
#else
|
||||
kernelName = @"blit_int64";
|
||||
#endif
|
||||
}
|
||||
mBlitPipeline = [context pipelineWithName:kernelName];
|
||||
|
||||
for (int i=0; i< des->regions.size(); ++i) {
|
||||
auto& slice = des->regions[i];
|
||||
Tensor::InsideDescribe::Region newRegion;
|
||||
OpCommonUtils::turnToPackRegion(slice, newRegion, output, 4);
|
||||
newRegion.dst.offset /= 4;
|
||||
newRegion.src.offset /= 4;
|
||||
SamplerInfo info;
|
||||
writeSamplerInfo(info, newRegion);
|
||||
auto local = [context computeBestGroupAndLocal:mBlitPipeline threads:MTLSizeMake(newRegion.size[0], newRegion.size[1], newRegion.size[2])];
|
||||
auto buffer = [context newDeviceBuffer:sizeof(SamplerInfo) bytes:&info access:CPUWriteOnly];
|
||||
mTempInputCopy.emplace_back(std::make_tuple((__bridge id<MTLBuffer>)(void*)newRegion.origin->deviceId(), buffer, local.first, local.second));
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for (int i=0; i< des->regions.size(); ++i) {
|
||||
auto& slice = des->regions[i];
|
||||
auto origin = slice.origin;
|
||||
|
|
@ -87,47 +146,6 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &inputs, const std::
|
|||
if (nullptr != mTempOutput) {
|
||||
backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
|
||||
}
|
||||
auto context = (__bridge MNNMetalContext *)static_cast<MetalBackend *>(backend())->context();
|
||||
for (int i=0; i< des->regions.size(); ++i) {
|
||||
auto& slice = des->regions[i];
|
||||
if (nullptr == slice.origin) {
|
||||
continue;
|
||||
}
|
||||
SamplerInfo info;
|
||||
writeSamplerInfo(info, slice);
|
||||
auto buffer = [context newDeviceBuffer:sizeof(SamplerInfo) bytes:&info access:CPUWriteOnly];
|
||||
|
||||
auto iter = mTempInput.find(slice.origin);
|
||||
std::vector<int> regionSize = {
|
||||
slice.size[0], slice.size[1], slice.size[2]
|
||||
};
|
||||
if (iter != mTempInput.end()) {
|
||||
mTempInputCopy.emplace_back(std::make_tuple((__bridge id<MTLBuffer>)(void*)iter->second->deviceId(), buffer, regionSize));
|
||||
continue;
|
||||
}
|
||||
mTempInputCopy.emplace_back(std::make_tuple((__bridge id<MTLBuffer>)(void*)slice.origin->deviceId(), buffer, regionSize));
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode MetalRaster::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
if (mNeedZero) {
|
||||
auto size = outputs[0]->elementSize();
|
||||
if (mTempOutput != nullptr) {
|
||||
size = mTempOutput->elementSize();
|
||||
}
|
||||
size = ((size + 3) / 4) * 4 * sizeof(metal_float);
|
||||
auto blitEncode = [context encoderBlit];
|
||||
[blitEncode fillBuffer:mOutputPtr range:NSMakeRange(0, size) value:0];
|
||||
[blitEncode endEncoding];
|
||||
}
|
||||
for (auto& iter : mTempInput) {
|
||||
backend->onCopyBuffer(iter.first, iter.second.get());
|
||||
}
|
||||
auto encoder = [context encoder];
|
||||
auto bytes = outputs[0]->getType().bytes();
|
||||
NSString* kernelName = nil;
|
||||
switch (bytes) {
|
||||
case 4:
|
||||
|
|
@ -143,21 +161,69 @@ ErrorCode MetalRaster::onExecute(const std::vector<Tensor *> &inputs, const std:
|
|||
break;
|
||||
}
|
||||
if (outputs[0]->getType().code == halide_type_float) {
|
||||
kernelName = @"blit_float";
|
||||
#if MNN_METAL_FULL_PRECISION
|
||||
kernelName = @"blit_int32";
|
||||
#else
|
||||
kernelName = @"blit_int16";
|
||||
#endif
|
||||
}
|
||||
auto bandwidth = [context load:kernelName encoder:encoder];
|
||||
mBlitPipeline = [context pipelineWithName:kernelName];
|
||||
for (int i=0; i< des->regions.size(); ++i) {
|
||||
auto& slice = des->regions[i];
|
||||
if (nullptr == slice.origin) {
|
||||
continue;
|
||||
}
|
||||
SamplerInfo info;
|
||||
writeSamplerInfo(info, slice);
|
||||
auto buffer = [context newDeviceBuffer:sizeof(SamplerInfo) bytes:&info access:CPUWriteOnly];
|
||||
|
||||
auto iter = mTempInput.find(slice.origin);
|
||||
auto local = [context computeBestGroupAndLocal:mBlitPipeline threads:MTLSizeMake(slice.size[0], slice.size[1], slice.size[2])];
|
||||
if (iter != mTempInput.end()) {
|
||||
mTempInputCopy.emplace_back(std::make_tuple((__bridge id<MTLBuffer>)(void*)iter->second->deviceId(), buffer, local.first, local.second));
|
||||
continue;
|
||||
}
|
||||
mTempInputCopy.emplace_back(std::make_tuple((__bridge id<MTLBuffer>)(void*)slice.origin->deviceId(), buffer, local.first, local.second));
|
||||
}
|
||||
mShapeTemp.clear();
|
||||
for (auto& iter : mTempInput) {
|
||||
id<MTLBuffer> shape = [context newDeviceBuffer:4*sizeof(int) access:CPUWriteOnly];
|
||||
mShapeTemp.emplace_back(std::move(shape));
|
||||
}
|
||||
if (nullptr != mTempOutput) {
|
||||
mShapeTemp.emplace_back([context newDeviceBuffer:4*sizeof(int) access:CPUWriteOnly]);
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode MetalRaster::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto backend = static_cast<MetalBackend *>(this->backend());
|
||||
auto context = (__bridge MNNMetalContext *)backend->context();
|
||||
if (mNeedZero) {
|
||||
backend->flushEncoder();
|
||||
auto size = outputs[0]->elementSize();
|
||||
if (mTempOutput != nullptr) {
|
||||
size = mTempOutput->elementSize();
|
||||
}
|
||||
size = ((size + 3) / 4) * 4 * sizeof(metal_float);
|
||||
auto blitEncode = [context encoderBlit];
|
||||
[blitEncode fillBuffer:mOutputPtr range:NSMakeRange(0, size) value:0];
|
||||
[blitEncode endEncoding];
|
||||
}
|
||||
auto encoder = backend->encoder();
|
||||
int index = 0;
|
||||
for (auto& iter : mTempInput) {
|
||||
backend->onCopyBuffer(iter.first, iter.second.get(), encoder, mShapeTemp[index++]);
|
||||
}
|
||||
[encoder setComputePipelineState:mBlitPipeline];
|
||||
for (auto& iter : mTempInputCopy) {
|
||||
[encoder setBuffer: std::get<0>(iter) offset:0 atIndex: 0];
|
||||
[encoder setBuffer: mOutputPtr offset:0 atIndex: 1];
|
||||
[encoder setBuffer: std::get<1>(iter) offset:0 atIndex: 2];
|
||||
auto& size = std::get<2>(iter);
|
||||
[context dispatchEncoder:encoder
|
||||
threads:{ (NSUInteger)size[0], (NSUInteger)size[1], (NSUInteger)size[2]}
|
||||
bandwidth:bandwidth];
|
||||
[encoder dispatchThreadgroups:std::get<2>(iter) threadsPerThreadgroup:std::get<3>(iter)];
|
||||
}
|
||||
[encoder endEncoding];
|
||||
if (nullptr != mTempOutput) {
|
||||
backend->onCopyBuffer(mTempOutput.get(), outputs[0]);
|
||||
backend->onCopyBuffer(mTempOutput.get(), outputs[0], encoder, mShapeTemp[index]);
|
||||
}
|
||||
MNN_PRINT_ENCODER(context, encoder);
|
||||
return NO_ERROR;
|
||||
|
|
|
|||
|
|
@@ -31,13 +31,12 @@ ErrorCode MetalReLU::onExecute(const std::vector<Tensor *> &inputs, const std::v
    }

    MNN_ASSERT(mSlope.length == sizeof(float));
    auto encoder = [context encoder];
    auto encoder = backend->encoder();
    auto bandwidth = [context load:simd ? @"relu_x4" : @"relu_x1" encoder:encoder];
    [encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
    [encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
    [encoder setBuffer:mSlope offset:0 atIndex:2];
    [context dispatchEncoder:encoder threads:{ size, 1, 1 } bandwidth:bandwidth];
    [encoder endEncoding];
    MNN_PRINT_ENCODER(context, encoder);
    return NO_ERROR;
}

@@ -29,12 +29,11 @@ ErrorCode MetalReLU6::onExecute(const std::vector<Tensor *> &inputs, const std::
        size /= 4;
    }

    auto encoder = [context encoder];
    auto encoder = backend->encoder();
    auto bandwidth = [context load:simd ? @"relu6_x4" : @"relu6_x1" encoder:encoder];
    [encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
    [encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
    [context dispatchEncoder:encoder threads:{ size, 1, 1 } bandwidth:bandwidth];
    [encoder endEncoding];
    MNN_PRINT_ENCODER(context, encoder);
    return NO_ERROR;
}

@ -18,15 +18,16 @@ namespace MNN {
|
|||
|
||||
class MetalReduction : public Execution {
|
||||
public:
|
||||
MetalReduction(Backend *backend, const ReductionParam *reduction);
|
||||
MetalReduction(Backend *backend, const ReductionParam *reduction, halide_type_t type);
|
||||
virtual ~MetalReduction() = default;
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
private:
|
||||
NSString *mKernel;
|
||||
std::vector<std::shared_ptr<Tensor>> mMiddles;
|
||||
std::vector<int> mDims;
|
||||
int mAxis;
|
||||
id<MTLComputePipelineState> mPipeline;
|
||||
std::pair<MTLSize, MTLSize> mThreads;
|
||||
id<MTLBuffer> mConst;
|
||||
};
|
||||
|
||||
} // namespace MNN
|
||||
|
|
|
|||
|
|
@ -16,106 +16,69 @@
|
|||
#if MNN_METAL_ENABLED
|
||||
namespace MNN {
|
||||
|
||||
MetalReduction::MetalReduction(Backend *backend, const ReductionParam *p) : Execution(backend) {
|
||||
auto integer = p->dType() == DataType_DT_INT32;
|
||||
MetalReduction::MetalReduction(Backend *backend, const ReductionParam *p, halide_type_t type) : Execution(backend) {
|
||||
auto integer = type.code == halide_type_int;
|
||||
NSString *kernel;
|
||||
switch (p->operation()) {
|
||||
case ReductionType_SUM:
mKernel = integer ? @"reduce_sum_s" : @"reduce_sum_f";
kernel = integer ? @"reduce_sum_s" : @"reduce_sum_f";
break;
case ReductionType_ASUM:
case ReductionType_SUMSQ:
MNN_ASSERT(false); // both un-supported
break;
case ReductionType_MEAN:
mKernel = integer ? @"reduce_mean_s" : @"reduce_mean_f";
kernel = integer ? @"reduce_mean_s" : @"reduce_mean_f";
break;
case ReductionType_MAXIMUM:
mKernel = integer ? @"reduce_max_s" : @"reduce_max_f";
kernel = integer ? @"reduce_max_s" : @"reduce_max_f";
break;
case ReductionType_MINIMUM:
mKernel = integer ? @"reduce_min_s" : @"reduce_min_f";
kernel = integer ? @"reduce_min_s" : @"reduce_min_f";
break;
case ReductionType_PROD:
mKernel = integer ? @"reduce_prod_s" : @"reduce_prod_f";
kernel = integer ? @"reduce_prod_s" : @"reduce_prod_f";
break;
default:
break;
}
for (int i = 0; i < p->dim()->size(); i++) {
mDims.push_back(p->dim()->data()[i]);
}
// The reduce after geometry compute has only one axis
mAxis = p->dim()->data()[0];
auto mkbn = static_cast<MetalBackend *>(backend);
auto context = (__bridge MNNMetalContext *)mkbn->context();
mConst = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
mPipeline = [context pipelineWithName:kernel];
}

ErrorCode MetalReduction::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
if (mDims.size() <= 1)
return NO_ERROR;

auto backend = static_cast<MetalBackend *>(this->backend());
auto &input = inputs[0];
mMiddles.clear();
for (int i = 0; i < mDims.size() - 1; i++) {
auto middle = new Tensor(input->buffer().dimensions);
TensorUtils::copyShape(input, middle);
for (int j = 0; j <= i; j++) {
middle->buffer().dim[mDims[j]].extent = 1;
}
backend->onAcquireBuffer(middle, Backend::DYNAMIC);
mMiddles.push_back(std::shared_ptr<Tensor>(middle));
}
for (auto &t : mMiddles) {
backend->onReleaseBuffer(t.get(), Backend::DYNAMIC);
}
return NO_ERROR;
}

static void encode(MNNMetalContext *context, id<MTLComputeCommandEncoder> encoder, MetalBandwidth bandwidth,
const Tensor *input, const Tensor *output, int axis) {
auto ib = input->buffer();
int outsideSize = 1, axisSize = 1, insideSize = 1;
if (axis >= 0) {
for (int i = 0; i < axis; i++)
outsideSize *= ib.dim[i].extent;
axisSize = ib.dim[axis].extent;
for (int i = axis + 1; i < ib.dimensions; i++)
insideSize *= ib.dim[i].extent;
} else {
axisSize = input->elementSize();
for (int i = 0; i < mAxis; i++) {
outsideSize *= inputs[0]->length(i);
}

auto shape = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
((int *)shape.contents)[0] = outsideSize;
((int *)shape.contents)[1] = axisSize;
((int *)shape.contents)[2] = insideSize;
((int *)shape.contents)[3] = axisSize * insideSize;

[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:shape offset:0 atIndex:2];
[context dispatchEncoder:encoder
threads:{ (NSUInteger) outsideSize, (NSUInteger)insideSize, 1 }
bandwidth:bandwidth];
axisSize = inputs[0]->length(mAxis);
for (int i = mAxis + 1; i < inputs[0]->dimensions(); i++) {
insideSize *= inputs[0]->length(i);
}
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
((int *)mConst.contents)[0] = outsideSize;
((int *)mConst.contents)[1] = axisSize;
((int *)mConst.contents)[2] = insideSize;
((int *)mConst.contents)[3] = axisSize * insideSize;
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(outsideSize, insideSize, 1)];
return NO_ERROR;
}

ErrorCode MetalReduction::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto &input = inputs[0], &output = outputs[0];
auto encoder = [context encoder];
auto bandwidth = [context load:mKernel encoder:encoder];

if (mDims.empty()) {
encode(context, encoder, bandwidth, input, output, -1);
} else if (mDims.size() == 1) {
encode(context, encoder, bandwidth, input, output, mDims[0]);
} else {
encode(context, encoder, bandwidth, input, mMiddles[0].get(), mDims[0]);
for (int i = 1; i < mMiddles.size(); i++) {
encode(context, encoder, bandwidth, mMiddles[i - 1].get(), mMiddles[i].get(), mDims[i]);
}
encode(context, encoder, bandwidth, mMiddles.back().get(), output, mDims.back());
}

[encoder endEncoding];
auto encoder = backend->encoder();
[encoder setComputePipelineState:mPipeline];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:mConst offset:0 atIndex:2];
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
MNN_PRINT_ENCODER(context, encoder);
return NO_ERROR;
}

@@ -123,7 +86,18 @@ ErrorCode MetalReduction::onExecute(const std::vector<Tensor *> &inputs, const s
class MetalReductionCreator : public MetalBackend::Creator {
public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend) const {
return new MetalReduction(backend, op->main_as_ReductionParam());
auto param = op->main_as_ReductionParam();
switch (param->operation()) {
case ReductionType_ALL:
case ReductionType_ANY:
case ReductionType_ASUM:
case ReductionType_SUMSQ:
return nullptr;
default:
break;
};

return new MetalReduction(backend, op->main_as_ReductionParam(), inputs[0]->getType());
}
};
REGISTER_METAL_OP_CREATOR(MetalReductionCreator, OpType_Reduction);

@@ -20,11 +20,15 @@ class MetalScale : public Execution {
public:
MetalScale(Backend *backend, const Scale *scale);
virtual ~MetalScale() = default;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

private:
id<MTLBuffer> mScale;
id<MTLBuffer> mBias;
id<MTLBuffer> mConst;
id<MTLComputePipelineState> mPipeline;
std::pair<MTLSize, MTLSize> mThreads;
};

} // namespace MNN

@@ -17,18 +17,6 @@ struct scale_shape {
int batch;
};

kernel void scale_tf(const device ftype *in [[buffer(0)]],
device ftype *out [[buffer(1)]],
constant scale_shape &s [[buffer(2)]],
const device float *scales [[buffer(3)]],
const device float *biasTerms [[buffer(4)]],
uint2 gid [[thread_position_in_grid]]) {
if ((int)gid.x >= s.steps || (int)gid.y >= s.size * s.batch) return;

out[int(gid.y) * s.steps + int(gid.x)] =
in [int(gid.y) * s.steps + int(gid.x)] * ftype(scales[int(gid.x)]) + ftype(biasTerms[int(gid.x)]);
}

kernel void scale_ca(const device ftype4 *in [[buffer(0)]],
device ftype4 *out [[buffer(1)]],
constant scale_shape &s [[buffer(2)]],

@@ -21,50 +21,39 @@ MetalScale::MetalScale(Backend *backend, const Scale *scale) : Execution(backend
mBias = scale->biasData()
? [context newDeviceBuffer:channel4 * sizeof(float) bytes:scale->biasData()->data() access:CPUWriteOnly]
: [context newDeviceBuffer:channel4 * sizeof(float) access:CPUTransparent];
mConst = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
mPipeline = [context pipelineWithName:@"scale_ca"];
}

ErrorCode MetalScale::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
ErrorCode MetalScale::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto input = inputs[0], output = outputs[0];
auto output = outputs[0];

// shape
auto tf = input->getDimensionType() == Tensor::TENSORFLOW;
int w = output->width();
int h = output->height();
int c = output->channel();
int z = UP_DIV(c, 4);
((int *)mConst.contents)[0] = w*h;
((int *)mConst.contents)[1] = z;
((int *)mConst.contents)[2] = output->batch();
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(w*h, z * outputs[0]->batch(), 1)];
return NO_ERROR;
}

auto shape = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
((int *)shape.contents)[0] = w * h;
((int *)shape.contents)[2] = output->batch();
ErrorCode MetalScale::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto input = inputs[0], output = outputs[0];

auto encoder = [context encoder];
auto encoder = backend->encoder();
[encoder setComputePipelineState:mPipeline];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:shape offset:0 atIndex:2];
[encoder setBuffer:mConst offset:0 atIndex:2];
[encoder setBuffer:mScale offset:0 atIndex:3];
[encoder setBuffer:mBias offset:0 atIndex:4];
// tensorflow
if (tf) {
((int *)shape.contents)[1] = c;

auto bandwidth = [context load:@"scale_tf" encoder:encoder];
[context dispatchEncoder:encoder
threads:{ (NSUInteger) c, (NSUInteger)w * h * output->batch(), 1 }
bandwidth:bandwidth];
}
// caffe
else {
((int *)shape.contents)[1] = z;

auto bandwidth = [context load:@"scale_ca" encoder:encoder];
[context dispatchEncoder:encoder
threads:{ (NSUInteger)w * h, (NSUInteger)z * output->batch(), 1 }
bandwidth:bandwidth];
}
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
return NO_ERROR;
}

@@ -21,9 +21,13 @@ public:
MetalUnary(Backend *backend, UnaryOpOperation optype);
virtual ~MetalUnary() = default;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

private:
UnaryOpOperation mOpType;
id<MTLBuffer> mConstBuffer;
id<MTLComputePipelineState> mPipeline;
std::pair<MTLSize, MTLSize> mThreads;
};

} // namespace MNN

@@ -17,12 +17,12 @@ struct unary_shape {
int size;
};

static inline ftype4 neg(ftype4 value) { return -value; }
static inline ftype4 square(ftype4 value) { return value * value; }
static inline ftype4 expm1(ftype4 value) {return exp(value) - 1;}
static inline ftype4 reciprocal(ftype4 value) {return 1.0/(value);}
static inline ftype4 sigmoid(ftype4 value) {return 1.f / (1.f + exp(-value));}
static inline ftype4 log1p(ftype4 value) {return log(1.f + value);}
static inline float4 neg(float4 value) { return -value; }
static inline float4 square(float4 value) { return value * value; }
static inline float4 expm1(float4 value) {return exp(value) - 1;}
static inline float4 reciprocal(float4 value) {return 1.0/(value);}
static inline float4 sigmoid(float4 value) {return 1.f / (1.f + exp(-value));}
static inline float4 log1p(float4 value) {return log(1.f + value);}

#define define_op(op) \
kernel void unary_##op##_x4(const device ftype4 *in [[buffer(0)]], \

@@ -31,7 +31,7 @@ kernel void unary_##op##_x4(const device ftype4 *in [[buffer(0)]], \
uint3 gid [[thread_position_in_grid]]) { \
if (gid.x < (uint)s.width) { \
int off = gid.z * s.size + gid.y * s.width + gid.x; \
out[off] = op(in[off]); \
out[off] = (ftype4)(op((float4)(in[off]))); \
} \
} \

@@ -14,10 +14,6 @@
#if MNN_METAL_ENABLED
namespace MNN {

MetalUnary::MetalUnary(Backend *backend, UnaryOpOperation optype) : Execution(backend), mOpType(optype) {
// nothing to do
}

static NSString *kernelForType(UnaryOpOperation type) {
#define op_case(type, imp) \
case UnaryOpOperation_##type: \

@@ -56,26 +52,34 @@ static NSString *kernelForType(UnaryOpOperation type) {
}
}

MetalUnary::MetalUnary(Backend *backend, UnaryOpOperation optype) : Execution(backend), mOpType(optype) {
auto mtbn = static_cast<MetalBackend *>(backend);
auto context = (__bridge MNNMetalContext *)mtbn->context();
mConstBuffer = [context newDeviceBuffer:3 * sizeof(int) access:CPUWriteOnly];
mPipeline = [context pipelineWithName:kernelForType(mOpType)];
}
ErrorCode MetalUnary::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto mtbn = static_cast<MetalBackend *>(backend());
auto context = (__bridge MNNMetalContext *)mtbn->context();
auto input = inputs[0];
auto element = input->elementSize();
auto sizeDiv4 = UP_DIV(element, 4);
((int *)mConstBuffer.contents)[0] = sizeDiv4;
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(sizeDiv4, 1, 1)];
return NO_ERROR;
}

ErrorCode MetalUnary::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();

// prepare
auto input = inputs[0], output = outputs[0];
auto element = input->elementSize();
auto sizeDiv4 = UP_DIV(element, 4);
// create shape
auto shape = [context newDeviceBuffer:3 * sizeof(int) access:CPUWriteOnly];
((int *)shape.contents)[0] = sizeDiv4;

auto encoder = [context encoder];
auto bandwidth = [context load:kernelForType(mOpType) encoder:encoder];
auto encoder = backend->encoder();
[encoder setComputePipelineState:mPipeline];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)input->deviceId() offset:0 atIndex:0];
[encoder setBuffer:(__bridge id<MTLBuffer>)(void *)output->deviceId() offset:0 atIndex:1];
[encoder setBuffer:shape offset:0 atIndex:2];
[context dispatchEncoder:encoder threads:{ (NSUInteger) sizeDiv4, (NSUInteger)1, (NSUInteger)1 } bandwidth:bandwidth];
[encoder endEncoding];
MNN_PRINT_ENCODER(context, encoder);
[encoder setBuffer:mConstBuffer offset:0 atIndex:2];
[encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
return NO_ERROR;
}

@@ -36,7 +36,9 @@ CLRuntime::CLRuntime(const Backend::Info& info){
} else {
mOpenCLRuntime.reset(new OpenCLRuntime(false));
}
if(mOpenCLRuntime.get()){

mCLRuntimeError = mOpenCLRuntime->isCreateError();
if(!mCLRuntimeError){
mImagePool.reset(new ImagePool(mOpenCLRuntime->context()));
mStaticImagePool.reset(new ImagePool(mOpenCLRuntime->context()));
mBufferPool.reset(new BufferPool(mOpenCLRuntime->context(), CL_MEM_READ_WRITE));

@@ -69,6 +71,10 @@ void CLRuntime::onGabageCollect(int level) {
//nothing now
}

bool CLRuntime::isCLRuntimeError() {
return mCLRuntimeError;
}

std::map<OpType, OpenCLBackend::Creator*>* gCreator() {
static std::once_flag once;
static std::map<OpType, OpenCLBackend::Creator*>* creators = nullptr;

@@ -574,7 +580,7 @@ void OpenCLBackend::copyToDevice(const Tensor* srcTensor, const Tensor* dstTenso
mOpenCLRuntime->commandQueue().enqueueWriteBuffer(*mHostBuffer.second, CL_TRUE, 0, srcTensor->elementSize()*sizeof(float), hostPtr);
}
#else
mOpenCLRuntime->commandQueue().enqueueWriteBuffer(*mHostBuffer.second, CL_FALSE, 0, srcTensor->elementSize()*sizeof(float), hostPtr);
mOpenCLRuntime->commandQueue().enqueueWriteBuffer(*mHostBuffer.second, CL_TRUE, 0, srcTensor->elementSize()*sizeof(float), hostPtr);
#endif
// Host -> OpenCL
MNN_DATA_FORMAT data_format = TensorUtils::getDescribe(srcTensor)->dimensionFormat;

@@ -658,7 +664,12 @@ class CLRuntimeCreator : public RuntimeCreator {
return nullptr;
}
#endif
return new CLRuntime(info);
auto rt = new CLRuntime(info);
if(rt->isCLRuntimeError() == true) {
delete rt;
return nullptr;
}
return rt;
}
virtual bool onValid(Backend::Info& info) const {
return true;

@@ -77,7 +77,8 @@ public:
virtual void onGabageCollect(int level) override;
virtual std::pair<const void*, size_t> onGetCache() override;
virtual bool onSetCache(const void* buffer, size_t size) override;

bool isCLRuntimeError();

private:
Backend::Info mInfo;
std::shared_ptr<ImagePool> mImagePool;

@@ -87,6 +88,7 @@ private:
std::shared_ptr<OpenCLRuntime> mOpenCLRuntime;

BackendConfig::PrecisionMode mPrecision;
bool mCLRuntimeError = false;

friend class OpenCLBackend;

@@ -45,12 +45,12 @@ OpenCLRuntime::OpenCLRuntime(bool permitFloat16) {
std::vector<cl::Platform> platforms;
cl_int res = cl::Platform::get(&platforms);
MNN_CHECK_CL_SUCCESS(res);
if(platforms.size() > 0){
if(platforms.size() > 0 && res == CL_SUCCESS){
cl::Platform::setDefault(platforms[0]);
std::vector<cl::Device> gpuDevices;
platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &gpuDevices);
res = platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &gpuDevices);

if(1 <= gpuDevices.size()){
if(1 <= gpuDevices.size() && res == CL_SUCCESS){
mFirstGPUDevicePtr = std::make_shared<cl::Device>(gpuDevices[0]);
const std::string deviceName = mFirstGPUDevicePtr->getInfo<CL_DEVICE_NAME>();
const std::string deviceVersion = mFirstGPUDevicePtr->getInfo<CL_DEVICE_VERSION>();

@@ -248,7 +248,7 @@ static int8_t *ReadQuanData_c(unsigned char *&s, uint32_t *len) {
return blob;
}

static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, uint32_t *len) {
static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, uint32_t *len, const flatbuffers::Vector<float> *alpha) {
// MNN_ERROR("sparse:%d\n", 1);
unsigned short shape[64] = {0};
uint32_t ucMapSize = 0;

@@ -332,7 +332,21 @@ static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, uint32_t *len) {
}
// set blob data with idx and weight idx
{
memset(blob, 0, Size * sizeof(signed char));
if (alpha->size() == 2 * shape[0]) {
auto alphaPtr = alpha->data();
int area = Size / shape[0];
for (int i = 0; i < shape[0]; i++) {
float min = alphaPtr[2*i];
float scale = alphaPtr[2*i+1];
int zeroQuant = -128;
if (scale > 1e-6) {
zeroQuant = round((0.0f - min) / scale) + (-128);
}
memset(blob+area*i, zeroQuant, area * sizeof(signed char));
}
} else {
memset(blob, 0, Size * sizeof(signed char)); //backward compability with previous symmetric weight quant
}
int iPreIdx = 0;
for (int i = 0; i < nnz; i++) {
iPreIdx += arrIdx[i];

@@ -358,7 +372,7 @@ std::shared_ptr<ConvolutionCommon::Int8Common> ConvolutionCommon::load(const IDS
buffer = ReadQuanData_c(originBuffer, &weightLength);
}
if (2 == quan->type()) {
buffer = ReadSparseQuanData_c(originBuffer, &weightLength);
buffer = ReadSparseQuanData_c(originBuffer, &weightLength, quan->alpha());
}
// read fp16 data
if (3 == quan->type()) {

@@ -424,14 +438,27 @@ std::shared_ptr<ConvolutionCommon::Int8Common> ConvolutionCommon::load(const IDS
MNN_PRINT("Alloc memory error for extract idst int8/ Back to float\n");
return nullptr;
}
auto outputCount = result->alpha.size();
int outputCount = 0;
if (quan->readType() != 0) {
outputCount = result->alpha.size() / 2;
} else {
outputCount = result->alpha.size(); // backward compability with previous symmetric quantization
}
int partWeightSize = weightLength / outputCount;
for (int o = 0; o < outputCount; ++o) {
auto dstW = result->weightFloat.get() + o * partWeightSize;
auto srcW = result->weight.get() + o * partWeightSize;
float alpha = result->alpha.get()[o];
for (int j = 0; j < partWeightSize; ++j) {
dstW[j] = ((float)srcW[j]) * alpha * quan->quantScale();
if (result->alpha.size() == 2 * outputCount) {
float min = result->alpha.get()[2*o];
float alpha = result->alpha.get()[2*o+1];
for (int j = 0; j < partWeightSize; ++j) {
dstW[j] = (( (float)srcW[j] - (-128) ) * alpha + min) * quan->quantScale();
}
} else {
float alpha = result->alpha.get()[o];
for (int j = 0; j < partWeightSize; ++j) {
dstW[j] = ((float)srcW[j]) * alpha * quan->quantScale();
}
}
}

@@ -32,16 +32,16 @@ Session::Session(Schedule::ScheduleInfo&& info, Interpreter::SessionMode callBac
defaultInfo.numThread = 1;
mTensors = std::move(info.allTensors);
for (auto& iter : info.pipelineInfo) {
auto runtime = mRuntime.first.find(iter.first.type)->second.get();
auto rt = mRuntime.first.find(iter.first.type)->second.get();
auto cpuRuntime = mRuntime.second;
std::shared_ptr<Backend> first(runtime->onCreate());
std::shared_ptr<Backend> first(rt->onCreate());
std::shared_ptr<Backend> second;
if (first->type() == MNN_FORWARD_CPU) {
second = first;
} else {
second.reset(cpuRuntime->onCreate());
}
std::shared_ptr<Pipeline> newPipeline(new Pipeline(std::move(iter.second), first, second, inputMode == Interpreter::Session_Input_Inside, runtime->onGetCompilerType() == Runtime::Compiler_Geometry));
std::shared_ptr<Pipeline> newPipeline(new Pipeline(std::move(iter.second), first, second, inputMode == Interpreter::Session_Input_Inside, rt->onGetCompilerType() == Runtime::Compiler_Geometry));
mPipelines.emplace_back(std::move(newPipeline));
}
mInputs = std::move(info.inputTensors);

@@ -469,7 +469,7 @@ bool TensorUtils::fuseRegion(Tensor::InsideDescribe::Region& srcReg, Tensor::Ins
newSrc[index] = srcSrc[i];
}
if (dstSize.size() > sizeNum) {
for (int i = 3; i >= 0; i--) {
for (int i = 2; i >= 0; i--) {
dstReg.size[i] = i < dstSize.size() ? dstSize[i] : 1;
}
}

@@ -36,8 +36,8 @@ static void _ConverterInterp(const Interp* resize, InterpT* dstInfo, int inW, in
switch (resize->ctm()) {
case CoordinateTransformationMode_NotSet:
{
// For compability
if (resize->halfPixelCenters()) {
// For compability, old model's nearest don't support halfpixels
if (resize->halfPixelCenters() && resize->resizeType() != 1) {
dstInfo->heightScale = (float)(inH) / (float)(outH);
dstInfo->widthScale = (float)(inW) / (float)(outW);
dstInfo->widthOffset = 0.5f * dstInfo->widthScale - 0.5f;

@@ -194,8 +194,17 @@ public:
static void _create() {
std::shared_ptr<GeometryComputer> comp(new GeometryImageOp);
GeometryComputer::registerGeometryComputer(
comp, {OpType_ConvInt8, OpType_ConvolutionDepthwise, OpType_DeconvolutionDepthwise,
OpType_Pooling, OpType_Interp, OpType_Resize, OpType_Int8ToFloat, OpType_FloatToInt8});
comp, {
OpType_ConvInt8,
OpType_DepthwiseConvInt8,
OpType_ConvolutionDepthwise,
OpType_DeconvolutionDepthwise,
OpType_Pooling,
OpType_Interp,
OpType_Resize,
OpType_Int8ToFloat,
OpType_FloatToInt8
});
}

REGISTER_GEOMETRY(GeometryImageOp, _create);

@@ -27,6 +27,9 @@ class ConcatSizeComputer : public SizeComputer {
// Concat-inputs may have scalar which should be delete
for (const auto& input : inputs) {
auto inputDimensions = input->buffer().dimensions;
if (input->size() <= 0) {
continue;
}
::memcpy(ob.dim, input->buffer().dim, sizeof(halide_dimension_t) * inputDimensions);
ob.dimensions = inputDimensions;
ob.type = input->buffer().type;

@ -15,20 +15,46 @@ class ArgMaxTest : public MNNTestCase {
|
|||
public:
|
||||
virtual ~ArgMaxTest() = default;
|
||||
virtual bool run() {
|
||||
auto input = _Input({4, 4}, NHWC);
|
||||
input->setName("input_tensor");
|
||||
auto ArgMax_ = [](VARP input, int axis, int topK, int outMaxVal) {
|
||||
using namespace MNN;
|
||||
// input = _checkNC4HW4(input);
|
||||
std::unique_ptr<OpT> op(new OpT);
|
||||
op->main.type = OpParameter_ArgMax;
|
||||
op->type = OpType_ArgMax;
|
||||
op->main.value = new ArgMaxT;
|
||||
op->main.AsArgMax()->axis = axis;
|
||||
op->main.AsArgMax()->outMaxVal = outMaxVal;
|
||||
op->main.AsArgMax()->topK = topK;
|
||||
op->main.AsArgMax()->softmaxThreshold = 0;
|
||||
return (Variable::create(Expr::create(std::move(op), {input})));
|
||||
};
|
||||
auto input_nhwc = _Input({4, 4}, NHWC);
|
||||
auto input_nchw = _Input({4, 4}, NC4HW4);
|
||||
input_nhwc->setName("input_tensor_nhwc");
|
||||
input_nchw->setName("input_tensor_nchw");
|
||||
// set input data
|
||||
const float inpudata[] = {-1.0, 2.0, -3.0, 4.0, 5.0, -6.0, 7.0, -8.0,
|
||||
-9.0, -10.0, 11.0, 12.0, 13.0, 14.0, -15.0, -16.0};
|
||||
auto inputPtr = input->writeMap<float>();
|
||||
const float inpudata[] = {-1.0, 2.0, -3.0, 4.0,
|
||||
5.0, -6.0, 7.0, -8.0,
|
||||
-9.0, -10.0, 11.0, 12.0,
|
||||
13.0, 14.0, -15.0, -16.0};
|
||||
auto inputPtr = input_nhwc->writeMap<float>();
|
||||
memcpy(inputPtr, inpudata, 16 * sizeof(float));
|
||||
input->unMap();
|
||||
auto output_0 = _ArgMax(input, 0);
|
||||
auto output_1 = _ArgMax(input, 1);
|
||||
inputPtr = input_nchw->writeMap<float>();
|
||||
memcpy(inputPtr, inpudata, 16 * sizeof(float));
|
||||
input_nhwc->unMap();
|
||||
input_nchw->unMap();
|
||||
auto output_0 = _ArgMax(input_nhwc, 0);
|
||||
auto output_1 = _ArgMax(input_nhwc, 1);
|
||||
auto output_2 = ArgMax_(input_nchw, 1, 2, 0);
|
||||
auto output_3 = ArgMax_(input_nchw, 1, 1, 1);
|
||||
const std::vector<int> expectedOutput_0 = {3, 3, 2, 2};
|
||||
const std::vector<int> expectedOutput_1 = {3, 2, 3, 1};
|
||||
const std::vector<float> expectedOutput_2 = {3, 1, 2, 0, 3, 2, 1, 0};
|
||||
const std::vector<float> expectedOutput_3 = {3, 4, 2, 7, 3, 12, 1, 14};
|
||||
auto gotOutput_0 = output_0->readMap<int>();
|
||||
auto gotOutput_1 = output_1->readMap<int>();
|
||||
auto gotOutput_2 = output_2->readMap<float>();
|
||||
auto gotOutput_3 = output_3->readMap<float>();
|
||||
if (!checkVector<int>(gotOutput_0, expectedOutput_0.data(), 4, 0)) {
|
||||
MNN_ERROR("ArgMaxTest test axis_0 failed!\n");
|
||||
return false;
|
||||
|
|
@ -37,6 +63,14 @@ public:
|
|||
MNN_ERROR("ArgMaxTest test axis_1 failed!\n");
|
||||
return false;
|
||||
}
|
||||
if (!checkVector<float>(gotOutput_2, expectedOutput_2.data(), 8, 0)) {
|
||||
MNN_ERROR("ArgMaxTest test axis_1_top2 failed!\n");
|
||||
return false;
|
||||
}
|
||||
if (!checkVector<float>(gotOutput_3, expectedOutput_3.data(), 8, 0)) {
|
||||
MNN_ERROR("ArgMaxTest test axis_1_outVal failed!\n");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
|
|
|||
|
|
@ -98,4 +98,40 @@ public:
|
|||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
class MatMulSpeedConstTest : public MNNTestCase {
|
||||
public:
|
||||
virtual bool run() {
|
||||
int e = 540, h = 540, l = 320;
|
||||
auto res = _runConst(e, h, l);
|
||||
if (!res) {
|
||||
return false;
|
||||
}
|
||||
return _runConst(1024, 1024, 1024);
|
||||
}
|
||||
|
||||
bool _runConst(int e, int h, int l) {
|
||||
{
|
||||
// Use Conv1x1 instead of MatMul
|
||||
auto x0 = _Input({1, l, 1, h}, NC4HW4, halide_type_of<float>());
|
||||
auto y = _Conv(0.0f, 0.0f, x0, {l, e}, {1, 1});
|
||||
Variable::prepareCompute({y});
|
||||
const auto time = 100;
|
||||
MNN_PRINT("MatMul B Const (Conv1x1): [%d, %d, %d], run %d\n", h, l, e, time);
|
||||
{
|
||||
AUTOTIME;
|
||||
//Prepare
|
||||
x0->writeMap<float>();
|
||||
y->readMap<float>();
|
||||
}
|
||||
AUTOTIME;
|
||||
for (int t = 0; t < time; ++t) {
|
||||
x0->writeMap<float>();
|
||||
y->readMap<float>();
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
MNNTestSuiteRegister(MatMulSpeedTest, "speed/MatMulTest");
|
||||
MNNTestSuiteRegister(MatMulSpeedConstTest, "speed/MatMulBConstTest");
|
||||
|
|
|
|||
|
|
@@ -55,6 +55,7 @@ public:
bool saveHalfFloat;
bool forTraining = false;
int weightQuantBits = 0;// If weightQuantBits > 0, it means the bit
bool weightQuantAsymmetric = false;
// The path of the model compression file that stores the int8 calibration table
// or sparse parameters.
std::string compressionParamsFile = "";

@@ -46,6 +46,9 @@ cxxopts::Options Cli::initializeMNNConvertArgs(modelConfig &modelPath, int argc,
"debug", "Enable debugging mode.")(
"forTraining", "whether or not to save training ops BN and Dropout, default: false", cxxopts::value<bool>())(
"weightQuantBits", "save conv/matmul/LSTM float weights to int8 type, only optimize for model size, 2-8 bits, default: 0, which means no weight quant", cxxopts::value<int>())(
"weightQuantAsymmetric", "the default weight-quant uses SYMMETRIC quant method, which is compatible with old MNN versions. "
"you can try set --weightQuantAsymmetric to use asymmetric quant method to improve accuracy of the weight-quant model in some cases, "
"but asymmetric quant model cannot run on old MNN versions. You will need to upgrade MNN to new version to solve this problem. default: false", cxxopts::value<bool>())(
"compressionParamsFile",
"The path of the compression parameters that stores activation, "
"weight scales and zero points for quantization or information "

@@ -159,6 +162,9 @@ cxxopts::Options Cli::initializeMNNConvertArgs(modelConfig &modelPath, int argc,
if (result.count("weightQuantBits")) {
modelPath.weightQuantBits = result["weightQuantBits"].as<int>();
}
if (result.count("weightQuantAsymmetric")) {
modelPath.weightQuantAsymmetric = true;
}
if (result.count("saveStaticModel")) {
modelPath.saveStaticModel = true;
}

@ -17,6 +17,8 @@
|
|||
#include "writeFb.hpp"
|
||||
#include "cpp/ConfigFile.hpp"
|
||||
#include <MNN/MNNDefine.h>
|
||||
#include "cli.hpp"
|
||||
#include "../../common/Global.hpp"
|
||||
|
||||
using namespace MNN;
|
||||
using namespace std;
|
||||
|
|
@ -33,6 +35,23 @@ static float findAbsMax(const float *weights, const int count) {
|
|||
return absMax;
|
||||
}
|
||||
|
||||
static std::vector<float> findMinMax(const float *weights, const int count) {
|
||||
float min = weights[0];
|
||||
float max = weights[0];
|
||||
|
||||
for (int i = 1; i < count; i++) {
|
||||
float value = weights[i];
|
||||
if (value > max) {
|
||||
max = value;
|
||||
}
|
||||
if (value < min) {
|
||||
min = value;
|
||||
}
|
||||
}
|
||||
|
||||
return {min, max};
|
||||
}
|
||||
|
||||
static void WriteBlobDim(ostream &out, std::vector<int> dims)
|
||||
{
|
||||
char tmp[4];
|
||||
|
|
@ -44,6 +63,7 @@ static void WriteBlobDim(ostream &out, std::vector<int> dims)
|
|||
out.write((const char*)(&tmpShort), 2);
|
||||
}
|
||||
}
|
||||
|
||||
static void FillBuffer(char *buf, unsigned int buf_len, const char *arr, unsigned int arr_len, unsigned char iNeedBits)
|
||||
{
|
||||
memset(buf, 0, buf_len);
|
||||
|
|
@ -72,55 +92,111 @@ static void FillBuffer(char *buf, unsigned int buf_len, const char *arr, unsigne
|
|||
}
|
||||
}
|
||||
|
||||
static void GetWeightSet(set<int> &setWeight, const float* weightData, const float* alphaData, int area, int channel)
|
||||
static void GetWeightSet(set<int> &setWeight, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag)
|
||||
{
|
||||
setWeight.clear();
|
||||
for (int i = 0; i < channel; i++)
|
||||
{
|
||||
float alpha = alphaData[i];
|
||||
if (alpha <= 1e-6f)
|
||||
if (asymmetricQuantFlag) {
|
||||
for (int i = 0; i < channel; i++)
|
||||
{
|
||||
setWeight.insert(0);
|
||||
continue;
|
||||
float min = alphaData[2*i];
|
||||
float alpha = alphaData[2*i+1];
|
||||
if (alpha <= 1e-6f)
|
||||
{
|
||||
setWeight.insert(-128);
|
||||
continue;
|
||||
}
|
||||
for (int j = 0; j < area; j++)
|
||||
{
|
||||
float weight = weightData[i * area + j];
|
||||
setWeight.insert(round((weight - min) / alpha) + (-128));
|
||||
}
|
||||
}
|
||||
for (int j = 0; j < area; j++)
|
||||
} else {
|
||||
for (int i = 0; i < channel; i++)
|
||||
{
|
||||
float weight = weightData[i * area + j];
|
||||
setWeight.insert(round(weight / alpha));
|
||||
float alpha = alphaData[i];
|
||||
if (alpha <= 1e-6f)
|
||||
{
|
||||
setWeight.insert(0);
|
||||
continue;
|
||||
}
|
||||
for (int j = 0; j < area; j++)
|
||||
{
|
||||
float weight = weightData[i * area + j];
|
||||
setWeight.insert(round(weight / alpha));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static float GetSparsity(const float* weightData, int weightSize, unsigned int& nnz, int iMaxStep = -1)
|
||||
static float GetSparsity(const float* weightData, int weightSize, unsigned int& nnz, const float* alphaData, int area, int channel, bool asymmetricQuantFlag, int iMaxStep = -1)
|
||||
{
|
||||
nnz = 0;
|
||||
int iPreIdx = 0;
|
||||
float sparsity;
|
||||
for (int i = 0; i < weightSize; i++)
|
||||
{
|
||||
if (fabs(weightData[i]) > 1e-8f)
|
||||
{
|
||||
nnz++;
|
||||
iPreIdx = i;
|
||||
}
|
||||
if ((i - iPreIdx >= iMaxStep) && (iMaxStep != -1))
|
||||
{
|
||||
nnz++;
|
||||
iPreIdx = i;
|
||||
}
|
||||
}
|
||||
if (asymmetricQuantFlag) {
|
||||
for (int i = 0; i < weightSize; i++)
|
||||
{
|
||||
float min = alphaData[2*(i/area)];
|
||||
float alpha = alphaData[2*(i/area)+1];
|
||||
int zeroQuant = -128;
|
||||
if (alpha > 1e-6) {
|
||||
zeroQuant = round((0.0f - min) / alpha) + (-128);
|
||||
}
|
||||
|
||||
float weight = weightData[i];
|
||||
int value = -128;
|
||||
if (alpha > 1e-6)
|
||||
{
|
||||
value = round((weight - min) / alpha) + (-128);
|
||||
}
|
||||
|
||||
if (value != zeroQuant)
|
||||
{
|
||||
nnz++;
|
||||
iPreIdx = i;
|
||||
}
|
||||
if ((i - iPreIdx >= iMaxStep) && (iMaxStep != -1))
|
||||
{
|
||||
nnz++;
|
||||
iPreIdx = i;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < weightSize; i++)
|
||||
{
|
||||
float alpha = alphaData[i / area];
|
||||
float weight = weightData[i];
|
||||
int value = 0;
|
||||
if (alpha > 1e-6f)
|
||||
{
|
||||
value = round(weight / alpha);
|
||||
}
|
||||
|
||||
if (value != 0)
|
||||
{
|
||||
nnz++;
|
||||
iPreIdx = i;
|
||||
}
|
||||
if ((i - iPreIdx >= iMaxStep) && (iMaxStep != -1))
|
||||
{
|
||||
nnz++;
|
||||
iPreIdx = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
sparsity = 1 - 1.0f * nnz / weightSize;
|
||||
return sparsity;
|
||||
}
|
||||
|
||||
unsigned int GetBestMaxStep(const float* weightData, int weightSize, unsigned char& iMaxStepBits, int BlobDataSize)
|
||||
unsigned int GetBestMaxStep(const float* weightData, int weightSize, unsigned char& iMaxStepBits, int BlobDataSize, const float* alphaData, int area, int channel, bool asymmetricQuantFlag)
|
||||
{
|
||||
size_t szBestSize = 1000000000;
|
||||
unsigned int best_nnz = 0;
|
||||
for (int i = 2; i < 9; i++)
|
||||
{
|
||||
unsigned int nnz = 0;
|
||||
GetSparsity(weightData, weightSize, nnz, pow(2, i) - 1);
|
||||
GetSparsity(weightData, weightSize, nnz, alphaData, area, channel, asymmetricQuantFlag, pow(2, i) - 1);
|
||||
size_t tmp = ceil(0.125 * nnz * i) + ceil(0.125 * nnz * BlobDataSize);
|
||||
if (tmp < szBestSize)
|
||||
{
|
||||
|
|
@ -132,12 +208,12 @@ unsigned int GetBestMaxStep(const float* weightData, int weightSize, unsigned ch
|
|||
return best_nnz;
|
||||
}
|
||||
|
||||
static void WriteCQBlobs(ostream &out, const float* weightData, const float* alphaData, int area, int channel)
|
||||
static void WriteCQBlobs(ostream &out, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag)
|
||||
{
|
||||
//push values into buffer
|
||||
//Find int values in all blobs and check;
|
||||
set<int> setWeight;
|
||||
GetWeightSet(setWeight, weightData, alphaData, area, channel);
|
||||
GetWeightSet(setWeight, weightData, alphaData, area, channel, asymmetricQuantFlag);
|
||||
int iCount = setWeight.size();
|
||||
int iNeedBits = ceil(log2(iCount));
|
||||
if (iNeedBits > 8) {
|
||||
|
|
@ -155,19 +231,38 @@ static void WriteCQBlobs(ostream &out, const float* weightData, const float* alp
|
|||
{
|
||||
char *arr = new char[area * channel];
|
||||
char *tmp = arr;
|
||||
for (int i = 0; i < channel; i++)
|
||||
{
|
||||
float alpha = alphaData[i];
|
||||
for (int j = 0; j < area; j++)
|
||||
if (asymmetricQuantFlag) {
|
||||
for (int i = 0; i < channel; i++)
|
||||
{
|
||||
float weight = weightData[i * area + j];
|
||||
int value = 0;
|
||||
if (alpha > 1e-6f)
|
||||
float min = alphaData[2*i];
|
||||
float alpha = alphaData[2*i+1];
|
||||
for (int j = 0; j < area; j++)
|
||||
{
|
||||
value = round(weight / alpha);
|
||||
float weight = weightData[i * area + j];
|
||||
int value = -128;
|
||||
if (alpha > 1e-6f)
|
||||
{
|
||||
value = round((weight - min) / alpha) + (-128);
|
||||
}
|
||||
*tmp = mapWeight[value];
|
||||
tmp++;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < channel; i++)
|
||||
{
|
||||
float alpha = alphaData[i];
|
||||
for (int j = 0; j < area; j++)
|
||||
{
|
||||
float weight = weightData[i * area + j];
|
||||
int value = 0;
|
||||
if (alpha > 1e-6f)
|
||||
{
|
||||
value = round(weight / alpha);
|
||||
}
|
||||
*tmp = mapWeight[value];
|
||||
tmp++;
|
||||
}
|
||||
*tmp = mapWeight[value];
|
||||
tmp++;
|
||||
}
|
||||
}
|
||||
FillBuffer(buf, buf_len, arr, area * channel, iNeedBits);
|
||||
|
|
@ -194,10 +289,10 @@ static void WriteCQBlobs(ostream &out, const float* weightData, const float* alp
|
|||
delete[] buf;
|
||||
}
|
||||
|
||||
static void WriteSparseQuanBlobs(ostream &out, const float* weightData, const float* alphaData, int area, int channel)
|
||||
static void WriteSparseQuanBlobs(ostream &out, const float* weightData, const float* alphaData, int area, int channel, bool asymmetricQuantFlag)
|
||||
{
|
||||
set<int> setWeight;
|
||||
GetWeightSet(setWeight, weightData, alphaData, area, channel);
|
||||
GetWeightSet(setWeight, weightData, alphaData, area, channel, asymmetricQuantFlag);
|
||||
int iDataNeedBits = ceil(log2(setWeight.size()));
|
||||
unsigned int nnz = 0;
|
||||
int weightSize = area * channel;
|
||||
|
|
@ -210,7 +305,7 @@ static void WriteSparseQuanBlobs(ostream &out, const float* weightData, const fl
|
|||
}
|
||||
}
|
||||
unsigned char iNeedBits;
|
||||
nnz = GetBestMaxStep(weightData, weightSize, iNeedBits, iDataNeedBits);
|
||||
nnz = GetBestMaxStep(weightData, weightSize, iNeedBits, iDataNeedBits, alphaData, area, channel, asymmetricQuantFlag);
|
||||
//weight buf
|
||||
size_t data_buf_len = size_t(ceil(0.125 * iDataNeedBits * nnz));
|
||||
char* data_buf = new char[data_buf_len];
|
||||
|
|
@ -224,33 +319,69 @@ static void WriteSparseQuanBlobs(ostream &out, const float* weightData, const fl
|
|||
int iMaxStep = pow(2, iNeedBits) - 1;
|
||||
int iPreIdx = 0;
|
||||
unsigned char* dTmp = data_arr;
|
||||
for (int i = 0; i < weightSize; i++)
|
||||
{
|
||||
float alpha = alphaData[i / area];
|
||||
float weight = weightData[i];
|
||||
int value = 0;
|
||||
if (alpha > 1e-6f)
|
||||
if (asymmetricQuantFlag) {
|
||||
for (int i = 0; i < weightSize; i++)
|
||||
{
|
||||
value = round(weight / alpha);
|
||||
}
|
||||
float min = alphaData[2*(i/area)];
|
||||
float alpha = alphaData[2*(i/area)+1];
|
||||
int zeroQuant = -128;
|
||||
if (alpha > 1e-6) {
|
||||
zeroQuant = round((0.0f - min) / alpha) + (-128);
|
||||
}
|
||||
|
||||
if (value != 0)
|
||||
{
|
||||
*dTmp = mapWeight[value];
|
||||
*tmp = i - iPreIdx;
|
||||
iPreIdx = i;
|
||||
tmp++;
|
||||
dTmp++;
|
||||
}
|
||||
if (i - iPreIdx >= iMaxStep)
|
||||
{
|
||||
*dTmp = mapWeight[0];
|
||||
*tmp = i - iPreIdx;
|
||||
iPreIdx = i;
|
||||
tmp++;
|
||||
dTmp++;
|
||||
}
|
||||
}
|
||||
float weight = weightData[i];
|
||||
int value = -128;
|
||||
if (alpha > 1e-6)
|
||||
{
|
||||
value = round((weight - min) / alpha) + (-128);
|
||||
}
|
||||
|
||||
if (value != zeroQuant)
|
||||
{
|
||||
*dTmp = mapWeight[value];
|
||||
*tmp = i - iPreIdx;
|
||||
iPreIdx = i;
|
||||
tmp++;
|
||||
dTmp++;
|
||||
}
|
||||
if (i - iPreIdx >= iMaxStep)
|
||||
{
|
||||
*dTmp = mapWeight[zeroQuant];
|
||||
*tmp = i - iPreIdx;
|
||||
iPreIdx = i;
|
||||
tmp++;
|
||||
dTmp++;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < weightSize; i++)
|
||||
{
|
||||
float alpha = alphaData[i / area];
|
||||
float weight = weightData[i];
|
||||
int value = 0;
|
||||
if (alpha > 1e-6f)
|
||||
{
|
||||
value = round(weight / alpha);
|
||||
}
|
||||
|
||||
if (value != 0)
|
||||
{
|
||||
*dTmp = mapWeight[value];
|
||||
*tmp = i - iPreIdx;
|
||||
iPreIdx = i;
|
||||
tmp++;
|
||||
dTmp++;
|
||||
}
|
||||
if (i - iPreIdx >= iMaxStep)
|
||||
{
|
||||
*dTmp = mapWeight[0];
|
||||
*tmp = i - iPreIdx;
|
||||
iPreIdx = i;
|
||||
tmp++;
|
||||
dTmp++;
|
||||
}
|
||||
}
|
||||
}
|
||||
FillBuffer(buf, buf_len, (char*) arr_idx, nnz, iNeedBits);
|
||||
FillBuffer(data_buf, data_buf_len, (char*) data_arr, nnz, iDataNeedBits);
|
||||
delete[] arr_idx;
|
||||
|
|
@ -392,6 +523,8 @@ int writeFb(std::unique_ptr<MNN::NetT>& netT, const std::string& MNNModelFile, m
|
|||
}
|
||||
|
||||
auto CastParamsToInt8 = [](std::unique_ptr<MNN::OpT>& op, int bits) {
|
||||
auto gConverterConfig = Global<modelConfig>::Get();
|
||||
bool asymmetricQuantFlag = gConverterConfig->weightQuantAsymmetric;
|
||||
const auto opType = op->type;
|
||||
// Bits must from 2-8
|
||||
bits = std::max(bits, 2);
|
||||
|
|
@ -408,20 +541,35 @@ int writeFb(std::unique_ptr<MNN::NetT>& netT, const std::string& MNNModelFile, m
|
|||
const int weightSize = param->weight.size();
|
||||
int kernelNum = common->outputCount;
|
||||
int kernelSize = weightSize / kernelNum;
|
||||
|
||||
std::vector<float> scales(kernelNum);
|
||||
|
||||
auto weightData = param->weight.data();
|
||||
for (int k = 0; k < kernelNum; k++) {
|
||||
int beginIndex = k * kernelSize;
|
||||
auto absMax = findAbsMax(weightData + beginIndex, kernelSize);
|
||||
// TODO: Support low bit
|
||||
scales[k] = absMax / thredhold;
|
||||
|
||||
std::vector<float> scales;
|
||||
if (asymmetricQuantFlag) {
|
||||
scales.resize(kernelNum*2);
|
||||
for (int k = 0; k < kernelNum; k++) {
|
||||
int beginIndex = k * kernelSize;
|
||||
auto minAndMax = findMinMax(weightData + beginIndex, kernelSize);
|
||||
float min = minAndMax[0];
|
||||
float max = minAndMax[1];
|
||||
float scale = (max - min) / (127 + 128);
|
||||
|
||||
scales[2*k] = min;
|
||||
scales[2*k+1] = scale;
|
||||
}
|
||||
} else {
|
||||
scales.resize(kernelNum);
|
||||
for (int k = 0; k < kernelNum; k++) {
|
||||
int beginIndex = k * kernelSize;
|
||||
auto absMax = findAbsMax(weightData + beginIndex, kernelSize);
|
||||
|
||||
scales[k] = absMax / thredhold;
|
||||
}
|
||||
}
|
||||
|
||||
std::ostringstream outputStringStreamCQ;
|
||||
WriteCQBlobs(outputStringStreamCQ, weightData, scales.data(), kernelSize, kernelNum);
|
||||
WriteCQBlobs(outputStringStreamCQ, weightData, scales.data(), kernelSize, kernelNum, asymmetricQuantFlag);
|
||||
std::ostringstream outputStringStreamSQ;
|
||||
WriteSparseQuanBlobs(outputStringStreamSQ, weightData, scales.data(), kernelSize, kernelNum);
|
||||
WriteSparseQuanBlobs(outputStringStreamSQ, weightData, scales.data(), kernelSize, kernelNum, asymmetricQuantFlag);
|
||||
|
||||
param->quanParameter.reset(new MNN::IDSTQuanT);
|
||||
auto tempString = outputStringStreamCQ.str();
|
||||
|
|
@ -435,6 +583,9 @@ int writeFb(std::unique_ptr<MNN::NetT>& netT, const std::string& MNNModelFile, m
|
|||
::memcpy(param->quanParameter->buffer.data(), tempString.data(), tempString.size());
|
||||
param->quanParameter->alpha = std::move(scales);
|
||||
param->quanParameter->quantScale = 1.0f;
|
||||
if (asymmetricQuantFlag) {
|
||||
param->quanParameter->readType = kernelNum;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -418,13 +418,14 @@ bool fuseConstIntoSubgraph(MNN::NetT* net, const std::vector<MNN::SubGraphProtoT
}
// Try Optimize Subgraph for more const op get
auto* ctx = Global<OptimizeContext>::Get();
std::unordered_map<std::string, VARP> empty;
for (auto mutable_subgraph : modifiedSubGraph) {
std::unique_ptr<MNN::NetT> subnet(new MNN::NetT);
subnet->oplists = std::move(mutable_subgraph->nodes);
subnet->tensorName = std::move(mutable_subgraph->tensors);
subnet->sourceType = ctx->source;

std::unique_ptr<MNN::NetT> new_subnet = optimizeNetImpl(subnet, {}, ctx->is_train, false /*verbose*/);
std::unique_ptr<MNN::NetT> new_subnet = optimizeNetImpl(subnet, empty, ctx->is_train, false /*verbose*/);
mutable_subgraph->nodes = std::move(subnet->oplists);

MNN::SubGraphProtoT* new_subgraph = mutable_subgraph;

@@ -460,7 +461,8 @@ std::unique_ptr<MNN::NetT> optimizeNet(std::unique_ptr<MNN::NetT>& originNet, bo
auto ctx = OptimizeContext{subgraphs, forTraining, originNet->sourceType};
Global<OptimizeContext>::Reset(&ctx);

std::unique_ptr<MNN::NetT> net = optimizeNetImpl(originNet, {}, forTraining);
std::unordered_map<std::string, VARP> empty;
std::unique_ptr<MNN::NetT> net = optimizeNetImpl(originNet, empty, forTraining);
fuseConstIntoSubgraph(net.get(), ctx.completed_subgraphs);
for (auto* subgraph : ctx.completed_subgraphs) {
net->subgraphs.emplace_back(subgraph);

@@ -7,6 +7,13 @@
//

#include <MNN/expr/Optimizer.hpp>
#include <stdexcept>

#define MNN_THROW_CHECK(success, log) \
if(!(success)){ \
MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
throw std::runtime_error("Error for onnx convert");\
}

namespace MNN {
namespace Express {

@ -0,0 +1,149 @@
|
|||
//
|
||||
// FuseTfPrelu.cpp
|
||||
// MNNConverter
|
||||
//
|
||||
// Created by MNN on 2020/11/13.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#include "../TemplateMerge.hpp"
|
||||
#include "MNN/expr/MathOp.hpp"
|
||||
#include "MNN/expr/NeuralNetWorkOp.hpp"
|
||||
#include "MNN_generated.h"
|
||||
#include "../../common/Global.hpp"
|
||||
|
||||
namespace MNN {
|
||||
namespace Express {
|
||||
|
||||
enum PreluCases {
|
||||
None,
|
||||
OCRCustom,
|
||||
};
|
||||
|
||||
static auto gRegister = []() {
|
||||
auto match = [](EXPRP expr) {
|
||||
PreluCases preluCase = PreluCases::None;
|
||||
|
||||
// ocr custom case of prelu
|
||||
{
|
||||
if (nullptr == expr->get()) {
|
||||
return false;
|
||||
}
|
||||
if (expr->get()->type() != OpType_Eltwise) {
|
||||
return false;
|
||||
}
|
||||
if (expr->get()->main_as_Eltwise()->type() != EltwiseType_SUM) {
|
||||
return false;
|
||||
}
|
||||
if (expr->inputs().size() != 2) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto leftReluVar = expr->inputs().at(0);
|
||||
auto leftReluExpr = leftReluVar->expr().first;
|
||||
if (leftReluExpr->get() == nullptr) {
|
||||
return false;
|
||||
}
|
||||
if (leftReluExpr->get()->type() != OpType_ReLU) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto rightBinaryVar = expr->inputs().at(1);
|
||||
auto rightBinaryExpr = rightBinaryVar->expr().first;
|
||||
if (rightBinaryExpr->get() == nullptr) {
|
||||
return false;
|
||||
}
|
||||
if (rightBinaryExpr->get()->type() != OpType_BinaryOp) {
|
||||
return false;
|
||||
}
|
||||
if (rightBinaryExpr->get()->main_as_BinaryOp()->opType() != BinaryOpOperation_MUL) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto rightBinaryConstVar = rightBinaryExpr->inputs().at(0);
|
||||
auto rightBinaryConstExpr = rightBinaryConstVar->expr().first;
|
||||
if (rightBinaryConstExpr->get() != nullptr) {
|
||||
return false;
|
||||
}
|
||||
auto rightBinaryReluVar = rightBinaryExpr->inputs().at(1);
|
||||
auto rightBinaryReluExpr = rightBinaryReluVar->expr().first;
|
||||
if (rightBinaryReluExpr->get() == nullptr) {
|
||||
return false;
|
||||
}
|
||||
bool cond = ((rightBinaryConstExpr->inputType() == VARP::CONSTANT) && (rightBinaryReluExpr->get()->type() == OpType_ReLU));
|
||||
if (!cond) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto unaryVar = rightBinaryReluExpr->inputs().at(0);
|
||||
auto unaryExpr = unaryVar->expr().first;
|
||||
if (unaryExpr->get() == nullptr) {
|
||||
return false;
|
||||
}
|
||||
if (unaryExpr->get()->type() != OpType_UnaryOp) {
|
||||
return false;
|
||||
}
|
||||
if (unaryExpr->get()->main_as_UnaryOp()->opType() != UnaryOpOperation_NEG) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto leftSourceVar = leftReluExpr->inputs().at(0);
|
||||
auto rightSourceVar = unaryExpr->inputs().at(0);
|
||||
if (leftSourceVar->expr() != rightSourceVar->expr()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
preluCase = PreluCases::OCRCustom;
|
||||
}
|
||||
|
||||
Global<PreluCases>::Reset(&preluCase);
|
||||
|
||||
if (preluCase != PreluCases::None) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
};
|
||||
|
||||
auto transform = [](EXPRP expr) {
|
||||
auto preluCase = Global<PreluCases>::Get();
|
||||
|
||||
// ocr custom case of prelu
|
||||
if (*preluCase == PreluCases::OCRCustom) {
|
||||
auto leftReluVar = expr->inputs().at(0);
|
||||
auto leftReluExpr = leftReluVar->expr().first;
|
||||
auto sourceVar = leftReluExpr->inputs().at(0);
|
||||
auto rightBinaryVar = expr->inputs().at(1);
|
||||
auto rightBinaryExpr = rightBinaryVar->expr().first;
|
||||
auto rightBinaryConstVar = rightBinaryExpr->inputs().at(0);
|
||||
|
||||
std::unique_ptr<MNN::OpT> PreluOp(new OpT);
|
||||
PreluOp->type = OpType_PReLU;
|
||||
PreluOp->name = expr->name();
|
||||
PreluOp->main.type = OpParameter_PRelu;
|
||||
PreluOp->main.value = new PReluT;
|
||||
auto PreluParameter = PreluOp->main.AsPRelu();
|
||||
{
|
||||
auto PreluPoint = _Negative(rightBinaryConstVar);
|
||||
auto PreluPointInfo = PreluPoint->getInfo();
|
||||
auto PreluPointPtr = PreluPoint->readMap<float>();
|
||||
PreluParameter->slope.resize(PreluPointInfo->size);
|
||||
::memcpy(PreluParameter->slope.data(), PreluPointPtr, PreluPointInfo->size * sizeof(float));
|
||||
PreluParameter->slopeCount = PreluPointInfo->size;
|
||||
}
|
||||
auto newVar = Variable::create(Expr::create(PreluOp.get(), {sourceVar}, expr->outputSize()));
|
||||
newVar->setName(expr->outputName(0));
|
||||
Expr::replace(expr, newVar->expr().first);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
};
|
||||
|
||||
TemplateMerge::getInstance("Merge").insertTemplate("FuseTfPrelu", match, transform);
|
||||
return true;
|
||||
}();
|
||||
|
||||
}
|
||||
} // namespace MNN
|
||||