//
//  ShapeAttention.cpp
//  MNN
//
//  Created by MNN on 2023/09/10.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "shape/SizeComputer.hpp"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
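
// This file registers shape inference for the fused attention ops (FmhaV2,
// Fmhca, Attention): each SizeComputer below fills in the output tensor's
// extents, data type and dimension format from its inputs.
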
namespace MNN {

#ifdef MNN_SUPPORT_TRANSFORMER_FUSE

class FmhaV2SizeComputer : public SizeComputer {
    virtual bool onComputeSize(const MNN::Op* op, const std::vector<Tensor*>& inputs,
                               const std::vector<Tensor*>& outputs) const override {
        MNN_ASSERT(inputs.size() == 1);
        auto input0 = inputs[0], output0 = outputs[0];
        MNN_ASSERT(input0->buffer().dimensions == 3);
        // The single input packs Q, K and V along the last axis, so the output
        // keeps the first two extents and takes one third of the last one.
        output0->buffer().dim[0].extent = input0->buffer().dim[0].extent;
        output0->buffer().dim[1].extent = input0->buffer().dim[1].extent;
        output0->buffer().dim[2].extent = input0->buffer().dim[2].extent / 3;
        output0->buffer().dimensions = 3;
        output0->buffer().type = input0->buffer().type;
        TensorUtils::getDescribe(output0)->dimensionFormat = TensorUtils::getDescribe(input0)->dimensionFormat;
        return true;
    }
};
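
// Worked shape example (a sketch; the [batch, seqLen, 3 * numHead * headDim]
// packing is an assumption about the fused-QKV layout): an input of extent
// [2, 77, 3 * 8 * 64] = [2, 77, 1536] yields an output of extent [2, 77, 512].
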
class FmhcaSizeComputer : public SizeComputer {
    virtual bool onComputeSize(const MNN::Op* op, const std::vector<Tensor*>& inputs,
                               const std::vector<Tensor*>& outputs) const override {
        MNN_ASSERT(inputs.size() == 2);
        MNN_ASSERT(outputs.size() == 1);
        auto input0 = inputs[0];   // query
        auto input1 = inputs[1];   // cross key/value source
        auto output0 = outputs[0];
        MNN_ASSERT(input0->buffer().dimensions == 3);
        MNN_ASSERT(input1->buffer().dimensions == 3);
        // Cross attention keeps the query's shape; input1 only contributes the
        // keys/values, and its batch extent is expected to match input0's.
        output0->buffer().dim[0].extent = input0->buffer().dim[0].extent;
        output0->buffer().dim[1].extent = input0->buffer().dim[1].extent;
        output0->buffer().dim[2].extent = input0->buffer().dim[2].extent;
        output0->buffer().dimensions = 3;
        output0->buffer().type = input0->buffer().type;
        TensorUtils::getDescribe(output0)->dimensionFormat = TensorUtils::getDescribe(input0)->dimensionFormat;
        return true;
    }
};
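
// Worked shape example (a sketch; extents are illustrative): with a query of
// extent [2, 77, 512] and any 3-D key/value tensor as the second input, the
// output extent is [2, 77, 512], matching the query.
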
class AttentionSizeComputer : public SizeComputer {
    virtual bool onComputeSize(const MNN::Op* op, const std::vector<Tensor*>& inputs,
                               const std::vector<Tensor*>& outputs) const override {
        auto input = inputs[0], output = outputs[0];
        MNN_ASSERT(input->buffer().dimensions == 4);
        // Fold the last two axes of the 4-D input into one channel axis:
        // [batch, seqLen, numHead, headDim] -> [batch, seqLen, numHead * headDim].
        output->buffer().dim[0].extent = input->buffer().dim[0].extent;
        output->buffer().dim[1].extent = input->buffer().dim[1].extent;
        output->buffer().dim[2].extent = input->buffer().dim[2].extent * input->buffer().dim[3].extent;
        output->buffer().dimensions = 3;
        output->buffer().type = input->buffer().type;
        TensorUtils::getDescribe(output)->dimensionFormat = TensorUtils::getDescribe(input)->dimensionFormat;
        return true;
    }
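
    // Worked shape example (a sketch; extents are illustrative): a 4-D input of
    // extent [2, 1024, 8, 64] yields a 3-D output of extent [2, 1024, 512].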

    virtual float onComputeFlops(const MNN::Op* op, const std::vector<Tensor*>& inputs,
                                 const std::vector<Tensor*>& outputs) const override {
        auto seqLen = static_cast<float>(outputs[0]->length(1));
        // length(2) of the output is the folded numHead * headDim axis computed above.
        auto headDim = static_cast<float>(outputs[0]->length(2));
        float flops = 0.f;
        // two matmuls: Q * K^T and scores * V, each seqLen * seqLen * headDim
        flops += (2 * seqLen * headDim * seqLen);
        // softmax over the seqLen x seqLen score matrix
        flops += (seqLen * seqLen);
        return flops / FLOPS_M;
    }
};
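
// Worked FLOPs example (a sketch, assuming FLOPS_M scales to millions): for an
// output of extent [2, 1024, 512], seqLen = 1024 and headDim = 512, so
// flops = 2 * 1024 * 512 * 1024 + 1024 * 1024 = 1074790400, and the method
// returns about 1074.8 MFLOPs.
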
REGISTER_SHAPE_INPUTS_TRANSFORMER_FUSE(FmhaV2SizeComputer, OpType_FmhaV2);
REGISTER_SHAPE_INPUTS_TRANSFORMER_FUSE(FmhcaSizeComputer, OpType_Fmhca);
REGISTER_SHAPE_INPUTS_TRANSFORMER_FUSE(AttentionSizeComputer, OpType_Attention);
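
// A minimal sketch of how these computers are reached (assuming
// SizeComputer::computeOutputSize is the dispatch entry, as elsewhere in MNN):
//
//   // Fills outputs[0]'s extents/type/format via the computer registered
//   // for op->type(), e.g. AttentionSizeComputer for OpType_Attention.
//   bool ok = SizeComputer::computeOutputSize(op, inputs, outputs);
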
#endif
} // namespace MNN