mirror of https://github.com/ollama/ollama.git
117 lines
3.4 KiB
Go
117 lines
3.4 KiB
Go
package convert
|
|
|
|
import (
|
|
"cmp"
|
|
"encoding/json"
|
|
"io/fs"
|
|
"slices"
|
|
"strings"
|
|
|
|
"github.com/ollama/ollama/fs/ggml"
|
|
)
|
|
|
|
type qwen3VLModel struct {
|
|
qwen3Model `json:"text_config"`
|
|
|
|
VisionModel struct {
|
|
Depth uint32 `json:"depth"`
|
|
HiddenSize uint32 `json:"hidden_size"`
|
|
NumHeads uint32 `json:"num_heads"`
|
|
InChannels uint32 `json:"in_channels"`
|
|
PatchSize uint32 `json:"patch_size"`
|
|
SpatialMergeSize uint32 `json:"spatial_merge_size"`
|
|
WindowSize uint32 `json:"window_size"`
|
|
RMSNormEps float32 `json:"layer_norm_epsilon"`
|
|
RopeTheta float32 `json:"rope_theta"`
|
|
TemporalPatchSize uint32 `json:"temporal_patch_size"`
|
|
DeepstackVisualIndexes []int32 `json:"deepstack_visual_indexes"`
|
|
|
|
Size struct {
|
|
ShortestEdge uint32 `json:"shortest_edge"`
|
|
LongestEdge uint32 `json:"longest_edge"`
|
|
} `json:"size"`
|
|
|
|
ImageMean []float32 `json:"image_mean"`
|
|
ImageStd []float32 `json:"image_std"`
|
|
} `json:"vision_config"`
|
|
}
|
|
|
|
func (m *qwen3VLModel) parseMore(fsys fs.FS) error {
|
|
bts, err := fs.ReadFile(fsys, "preprocessor_config.json")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
return json.Unmarshal(bts, &m.VisionModel)
|
|
}
|
|
|
|
func (m *qwen3VLModel) KV(t *Tokenizer) ggml.KV {
|
|
kv := m.qwen3Model.KV(t)
|
|
|
|
arch := "qwen3vl"
|
|
if m.NumExperts > 0 {
|
|
arch += "moe"
|
|
}
|
|
// override architecture
|
|
kv["general.architecture"] = arch
|
|
|
|
kv["vision.block_count"] = cmp.Or(m.VisionModel.Depth, 32)
|
|
kv["vision.embedding_length"] = m.VisionModel.HiddenSize
|
|
kv["vision.attention.head_count"] = cmp.Or(m.VisionModel.NumHeads, 16)
|
|
kv["vision.num_channels"] = m.VisionModel.InChannels
|
|
kv["vision.patch_size"] = cmp.Or(m.VisionModel.PatchSize, 14)
|
|
kv["vision.spatial_merge_size"] = cmp.Or(m.VisionModel.SpatialMergeSize, 2)
|
|
kv["vision.attention.layer_norm_epsilon"] = cmp.Or(m.VisionModel.RMSNormEps, 1e-6)
|
|
kv["vision.rope.freq_base"] = cmp.Or(m.VisionModel.RopeTheta, 1e4)
|
|
kv["vision.temporal_patch_size"] = cmp.Or(m.VisionModel.TemporalPatchSize, 2)
|
|
kv["vision.deepstack_visual_indexes"] = m.VisionModel.DeepstackVisualIndexes
|
|
|
|
kv["vision.shortest_edge"] = m.VisionModel.Size.ShortestEdge
|
|
kv["vision.longest_edge"] = m.VisionModel.Size.LongestEdge
|
|
|
|
kv["vision.image_mean"] = m.VisionModel.ImageMean
|
|
kv["vision.image_std"] = m.VisionModel.ImageStd
|
|
|
|
return kv
|
|
}
|
|
|
|
func (m *qwen3VLModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
|
var rest []Tensor
|
|
var out []*ggml.Tensor
|
|
for _, t := range ts {
|
|
switch {
|
|
case strings.Contains(t.Name(), "attn_qkv"):
|
|
out = append(out, slices.Collect(splitDim(t, 0,
|
|
split{Replacer: strings.NewReplacer("attn_qkv", "attn_q")},
|
|
split{Replacer: strings.NewReplacer("attn_qkv", "attn_k")},
|
|
split{Replacer: strings.NewReplacer("attn_qkv", "attn_v")},
|
|
))...)
|
|
case strings.Contains(t.Name(), "patch_embed") && strings.HasSuffix(t.Name(), "weight"):
|
|
shape := t.Shape()
|
|
out = append(out, &ggml.Tensor{
|
|
Name: t.Name(),
|
|
Kind: t.Kind(),
|
|
Shape: append([]uint64{shape[0] * shape[1]}, shape[2:]...),
|
|
WriterTo: t,
|
|
})
|
|
default:
|
|
rest = append(rest, t)
|
|
}
|
|
}
|
|
|
|
return append(m.qwen3Model.Tensors(rest), out...)
|
|
}
|
|
|
|
func (m *qwen3VLModel) Replacements() []string {
|
|
return append(
|
|
m.qwen3Model.Replacements(),
|
|
"model.language_", "",
|
|
"model.visual", "v",
|
|
"patch_embed.proj", "patch_embed",
|
|
"blocks", "blk",
|
|
"attn.qkv", "attn_qkv",
|
|
"attn.proj", "attn_out",
|
|
"deepstack_merger_list", "deepstack_merger",
|
|
)
|
|
}
|