package deepseekocr

import (
	"math"
	"slices"

	"github.com/ollama/ollama/fs"
	"github.com/ollama/ollama/kvcache"
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/ml/nn"
	"github.com/ollama/ollama/model"
	"github.com/ollama/ollama/model/input"
)

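// Model implements DeepSeek-OCR: a SAM encoder (Sam) and a CLIP-style vision
// encoder (Vision) extract image features, a linear projector (Projector)
// maps them into the text model's embedding space, and a decoder-only text
// model (Text) generates output tokens. ImageNewline and ViewSeperator are
// learned separator embeddings inserted into the image token sequence.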
type Model struct {
	model.Base
	model.TextProcessor

	Sam    *samModel    `gguf:"s"`
	Vision *visionModel `gguf:"v"`
	Text   *textModel

	ImageNewline ml.Tensor `gguf:"mm.image_newline"`
	//nolint:misspell // this misspelling is upstream. fixing it breaks the model
	ViewSeperator ml.Tensor `gguf:"mm.view_seperator"`

	Projector *nn.Linear `gguf:"mm.layers"`
}

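// EncodeMultimodal converts raw image bytes into embedding tensors. The image
// is preprocessed into local crop tiles plus a resized global view; each view
// is encoded by SAM and the vision encoder, projected into the text embedding
// space, and laid out row by row with image-newline embeddings, with a view
// separator appended after the global view.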
func (m *Model) EncodeMultimodal(ctx ml.Context, bts []byte) ([]input.Multimodal, error) {
	patches, original, crop, err := ProcessImage(ctx, bts)
	if err != nil {
		return nil, err
	}

	var outputs []ml.Tensor
	if true { // TODO: local features if sum(patches) != 0
		// Local features: encode the crop tiles with SAM, then refine with the
		// vision encoder.
		samOutputs := m.Sam.Forward(ctx, patches)
		visionOutputs := m.Vision.Forward(ctx, patches, samOutputs)

		// Flatten SAM's spatial dims and drop the vision encoder's leading
		// (class) token before fusing and projecting the two feature sets.
		samOutputs = samOutputs.Reshape(ctx, -1, samOutputs.Dim(2), samOutputs.Dim(3)).Permute(ctx, 1, 0, 2, 3)
		visionOutputs = visionOutputs.Slice(ctx, 1, 1, visionOutputs.Dim(1), 1)
		localOutputs := visionOutputs.Concat(ctx, samOutputs, 0)
		localOutputs = m.Projector.Forward(ctx, localOutputs)

		// Reassemble the per-tile tokens into a single crop[0]*hw by
		// crop[1]*hw grid, then terminate each row with the image-newline
		// embedding before flattening back to a token sequence.
		hw := int(math.Sqrt(float64(localOutputs.Dim(1))))
		localOutputs = localOutputs.Reshape(ctx, -1, hw, crop[0], crop[1])
		localOutputs = localOutputs.Permute(ctx, 0, 2, 1, 3)
		localOutputs = localOutputs.Contiguous(ctx, -1, crop[0]*hw, crop[1]*hw)
		localOutputs = localOutputs.Concat(ctx, m.ImageNewline.Repeat(ctx, 2, localOutputs.Dim(2)), 1)
		localOutputs = localOutputs.Reshape(ctx, localOutputs.Dim(0), -1)

		outputs = append(outputs, localOutputs)
	}

	// Global features: the same pipeline applied to the resized full image.
	samOutputs := m.Sam.Forward(ctx, original)
	visionOutputs := m.Vision.Forward(ctx, original, samOutputs)

	samOutputs = samOutputs.Reshape(ctx, -1, samOutputs.Dim(2), samOutputs.Dim(3)).Permute(ctx, 1, 0, 2, 3)
	visionOutputs = visionOutputs.Slice(ctx, 1, 1, visionOutputs.Dim(1), 1)
	globalOutputs := visionOutputs.Concat(ctx, samOutputs, 0)
	globalOutputs = m.Projector.Forward(ctx, globalOutputs)

	hw := int(math.Sqrt(float64(globalOutputs.Dim(1))))
	globalOutputs = globalOutputs.Reshape(ctx, -1, hw, hw)
	globalOutputs = globalOutputs.Concat(ctx, m.ImageNewline.Repeat(ctx, 2, globalOutputs.Dim(2)), 1)
	globalOutputs = globalOutputs.Reshape(ctx, globalOutputs.Dim(0), -1)

	// The view separator embedding marks the end of the global view.
	outputs = append(outputs, globalOutputs, m.ViewSeperator)
	return []input.Multimodal{
		{Tensor: outputs[0].Stack(ctx, 1, outputs[1:]...)},
	}, nil
}

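// PostTokenize expands each image input into a run of identical placeholder
// tokens (128815), one per column of its embedding tensor, so Forward can
// overwrite those positions with the projected image features. SameBatch
// keeps the whole run in a single batch.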
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
	outputs := make([]*input.Input, 0, len(inputs))
	for i := range inputs {
		if inputs[i].Multimodal == nil {
			outputs = append(outputs, inputs[i])
			continue
		}

		t := inputs[i].Multimodal[0].Tensor
		outputs = append(outputs, &input.Input{
			Token:          128815,
			Multimodal:     inputs[i].Multimodal,
			MultimodalHash: inputs[i].MultimodalHash,
			SameBatch:      t.Dim(1) - 1,
		})

		outputs = slices.Grow(outputs, t.Dim(1)-1)
		outputs = append(outputs, slices.Repeat([]*input.Input{{Token: 128815}}, t.Dim(1)-1)...)
	}
	return outputs, nil
}

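// Forward embeds the batch tokens, copies any image embeddings over their
// placeholder positions, runs the text transformer blocks against the KV
// cache, and returns logits from the output head.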
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
	inputsEmbeds := m.Text.TokenEmbedding.Forward(ctx, batch.Inputs).Duplicate(ctx)
	positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))

	// Overwrite the placeholder token embeddings with the image embeddings.
	for _, mm := range batch.Multimodal {
		t := mm.Multimodal[0].Tensor
		ctx.Forward(t.Copy(ctx, inputsEmbeds.View(ctx, mm.Index*inputsEmbeds.Stride(1), t.Dim(0)*t.Dim(1))))
	}

	hiddenStates := inputsEmbeds
	for i, block := range m.Text.Blocks {
		if m.Cache != nil {
			m.Cache.SetLayer(i)
		}

		// Only the final block produces outputs for the requested positions.
		var outputs ml.Tensor
		if i == len(m.Text.Blocks)-1 {
			outputs = batch.Outputs
		}

		hiddenStates = block.Forward(ctx, hiddenStates, positions, outputs, m.Cache, m.Text.Options)
	}

	hiddenStates = m.Text.OutputNorm.Forward(ctx, hiddenStates, m.Text.Options.eps)
	return m.Text.Output.Forward(ctx, hiddenStates), nil
}

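// init registers the "deepseekocr" constructor. The first
// leading_dense_block_count text blocks use a dense MLP feed-forward; the
// rest use MoE. Tokenizer data and hyperparameters are read from the GGUF
// config, with defaults where keys are absent.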
func init() {
	model.Register("deepseekocr", func(c fs.Config) (model.Model, error) {
		textBlocks := make([]textBlock, c.Uint("block_count"))
		leadingDenseBlockCount := int(c.Uint("leading_dense_block_count", 1))
		for i := range textBlocks {
			if i >= leadingDenseBlockCount {
				textBlocks[i].FeedForward = &textMoe{}
			} else {
				textBlocks[i].FeedForward = &textMLP{}
			}
		}

		m := Model{
			TextProcessor: model.NewBytePairEncoding(
				&model.Vocabulary{
					Values: c.Strings("tokenizer.ggml.tokens"),
					Types:  c.Ints("tokenizer.ggml.token_type"),
					Merges: c.Strings("tokenizer.ggml.merges"),
					AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
					BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
					AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
					EOS: append(
						[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
						c.Ints("tokenizer.ggml.eos_token_ids")...,
					),
				},
				// Split regex into multiple parts (according to DeepSeek3's regex)
				"\\p{N}{1,3}",
				`[一-龥぀-ゟ゠-ヿ]+`,
				"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
			),
			Text: &textModel{
				Blocks: textBlocks,
				Options: textOptions{
					hiddenSize:     int(c.Uint("embedding_length")),
					numHeads:       int(c.Uint("attention.head_count")),
					numKVHeads:     int(c.Uint("attention.head_count_kv")),
					numExperts:     int(c.Uint("expert_count")),
					numExpertsUsed: int(c.Uint("expert_used_count")),
					ropeBase:       c.Float("rope.freq_base", 10_000),
					ropeScale:      c.Float("rope.scaling.factor", 1.0),
					eps:            c.Float("attention.layer_norm_rms_epsilon", 1e-6),
				},
			},
			Vision: &visionModel{
				Blocks: make([]visionBlock, c.Uint("vision.block_count")),
				Options: visionOptions{
					hiddenSize: int(c.Uint("vision.embedding_length")),
					numHeads:   int(c.Uint("vision.head_count")),
					imageSize:  int(c.Uint("vision.image_size", 224)),
					patchSize:  int(c.Uint("vision.patch_size", 14)),
					eps:        c.Float("vision.attention.layer_norm_epsilon", 1e-5),
				},
			},
			Sam: &samModel{
				Blocks: make([]samBlock, c.Uint("sam.block_count")),
				Options: samOptions{
					hiddenSize:            int(c.Uint("sam.embedding_length")),
					numHeads:              int(c.Uint("sam.head_count")),
					eps:                   c.Float("sam.attention.layer_norm_epsilon", 1e-6),
					globalAttentionLayers: c.Ints("sam.global_attention_indexes"),
				},
			},
		}

		m.Cache = kvcache.NewCausalCache(m.Text.Shift)
		return &m, nil
	})
}