mirror of https://github.com/ollama/ollama.git
179 lines
4.9 KiB
Go
179 lines
4.9 KiB
Go
package convert
|
|
|
|
import (
|
|
"bytes"
|
|
"cmp"
|
|
"encoding/binary"
|
|
"io"
|
|
"slices"
|
|
"strings"
|
|
|
|
"github.com/ollama/ollama/fs/ggml"
|
|
"github.com/pdevine/tensor"
|
|
"github.com/pdevine/tensor/native"
|
|
)
|
|
|
|
type gptossModel struct {
|
|
ModelParameters
|
|
HiddenLayers uint32 `json:"num_hidden_layers"`
|
|
HiddenSize uint32 `json:"hidden_size"`
|
|
IntermediateSize uint32 `json:"intermediate_size"`
|
|
AttentionHeads uint32 `json:"num_attention_heads"`
|
|
KeyValueHeads uint32 `json:"num_key_value_heads"`
|
|
HeadDim uint32 `json:"head_dim"`
|
|
Experts uint32 `json:"num_experts"`
|
|
ExpertsPerToken uint32 `json:"experts_per_token"`
|
|
RMSNormEpsilon float32 `json:"rms_norm_eps"`
|
|
InitialContextLength uint32 `json:"initial_context_length"`
|
|
RopeTheta float32 `json:"rope_theta"`
|
|
RopeScalingFactor float32 `json:"rope_scaling_factor"`
|
|
SlidingWindow uint32 `json:"sliding_window"`
|
|
}
|
|
|
|
// Compile-time assertion that *gptossModel implements ModelConverter.
var _ ModelConverter = (*gptossModel)(nil)
|
|
|
|
func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
|
|
kv := m.ModelParameters.KV(t)
|
|
kv["general.architecture"] = "gptoss"
|
|
kv["general.file_type"] = uint32(4)
|
|
kv["gptoss.context_length"] = uint32(m.RopeScalingFactor * float32(m.InitialContextLength))
|
|
kv["gptoss.block_count"] = m.HiddenLayers
|
|
kv["gptoss.embedding_length"] = m.HiddenSize
|
|
kv["gptoss.feed_forward_length"] = m.IntermediateSize
|
|
kv["gptoss.expert_count"] = m.Experts
|
|
kv["gptoss.expert_used_count"] = m.ExpertsPerToken
|
|
kv["gptoss.attention.head_count"] = m.AttentionHeads
|
|
kv["gptoss.attention.head_count_kv"] = m.KeyValueHeads
|
|
kv["gptoss.attention.key_length"] = m.HeadDim
|
|
kv["gptoss.attention.value_length"] = m.HeadDim
|
|
kv["gptoss.attention.layer_norm_rms_epsilon"] = cmp.Or(m.RMSNormEpsilon, 1e-5)
|
|
kv["gptoss.attention.sliding_window"] = m.SlidingWindow
|
|
kv["gptoss.rope.freq_base"] = m.RopeTheta
|
|
kv["gptoss.rope.scaling.factor"] = m.RopeScalingFactor
|
|
kv["gptoss.rope.scaling.original_context_length"] = m.InitialContextLength
|
|
kv["tokenizer.ggml.bos_token_id"] = uint32(199998) // <|startoftext|>
|
|
kv["tokenizer.ggml.add_bos_token"] = false
|
|
kv["tokenizer.ggml.eos_token_id"] = uint32(199999) // <|endoftext|>
|
|
kv["tokenizer.ggml.eos_token_ids"] = []int32{
|
|
199999, /* <|endoftext|> */
|
|
200002, /* <|return|> */
|
|
200012, /* <|call|> */
|
|
}
|
|
kv["tokenizer.ggml.add_eos_token"] = false
|
|
return kv
|
|
}
|
|
|
|
func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
|
var out []*ggml.Tensor
|
|
mxfp4s := make(map[string]*mxfp4)
|
|
for _, t := range ts {
|
|
if strings.HasSuffix(t.Name(), ".blocks") || strings.HasSuffix(t.Name(), ".scales") {
|
|
dot := strings.LastIndex(t.Name(), ".")
|
|
name, suffix := t.Name()[:dot], t.Name()[dot+1:]
|
|
if _, ok := mxfp4s[name]; !ok {
|
|
mxfp4s[name] = &mxfp4{}
|
|
}
|
|
|
|
switch suffix {
|
|
case "blocks":
|
|
mxfp4s[name].blocks = t
|
|
case "scales":
|
|
mxfp4s[name].scales = t
|
|
}
|
|
} else {
|
|
out = append(out, &ggml.Tensor{
|
|
Name: t.Name(),
|
|
Kind: t.Kind(),
|
|
Shape: t.Shape(),
|
|
WriterTo: t,
|
|
})
|
|
}
|
|
}
|
|
|
|
for name, mxfp4 := range mxfp4s {
|
|
dims := mxfp4.blocks.Shape()
|
|
out = append(out, &ggml.Tensor{
|
|
Name: name,
|
|
Kind: uint32(ggml.TensorTypeMXFP4),
|
|
Shape: []uint64{dims[0], dims[1], dims[2] * dims[3] * 2},
|
|
WriterTo: mxfp4,
|
|
})
|
|
}
|
|
|
|
return out
|
|
}
|
|
|
|
func (m *gptossModel) Replacements() []string {
|
|
return []string{
|
|
// noop replacements so other replacements will not be applied
|
|
".blocks", ".blocks",
|
|
".scales", ".scales",
|
|
// real replacements
|
|
"block", "blk",
|
|
"attn.norm", "attn_norm",
|
|
"attn.qkv", "attn_qkv",
|
|
"attn.sinks", "attn_sinks",
|
|
"attn.out", "attn_out",
|
|
"mlp.norm", "ffn_norm",
|
|
"mlp.gate", "ffn_gate_inp",
|
|
"mlp.mlp1_", "ffn_gate_up_exps.",
|
|
"mlp.mlp2_", "ffn_down_exps.",
|
|
"embedding", "token_embd",
|
|
"norm", "output_norm",
|
|
"unembedding", "output",
|
|
"scale", "weight",
|
|
}
|
|
}
|
|
|
|
type mxfp4 struct {
|
|
blocks, scales Tensor
|
|
}
|
|
|
|
func (m *mxfp4) WriteTo(w io.Writer) (int64, error) {
|
|
var b bytes.Buffer
|
|
if _, err := m.blocks.WriteTo(&b); err != nil {
|
|
return 0, err
|
|
}
|
|
|
|
blocksDims := make([]int, len(m.blocks.Shape()))
|
|
for i, d := range m.blocks.Shape() {
|
|
blocksDims[i] = int(d)
|
|
}
|
|
|
|
var blocks tensor.Tensor = tensor.New(tensor.WithShape(blocksDims...), tensor.WithBacking(b.Bytes()))
|
|
|
|
var s bytes.Buffer
|
|
if _, err := m.scales.WriteTo(&s); err != nil {
|
|
return 0, err
|
|
}
|
|
|
|
scalesDims := slices.Repeat([]int{1}, len(m.blocks.Shape()))
|
|
for i, d := range m.scales.Shape() {
|
|
scalesDims[i] = int(d)
|
|
}
|
|
|
|
var scales tensor.Tensor = tensor.New(tensor.WithShape(scalesDims...), tensor.WithBacking(s.Bytes()))
|
|
|
|
out, err := tensor.Concat(3, scales, blocks)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
|
|
out = tensor.Materialize(out)
|
|
|
|
if err := out.Reshape(out.Shape().TotalSize()); err != nil {
|
|
return 0, err
|
|
}
|
|
|
|
u8s, err := native.VectorU8(out.(*tensor.Dense))
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
|
|
if err := binary.Write(w, binary.LittleEndian, u8s); err != nil {
|
|
return 0, err
|
|
}
|
|
|
|
return 0, nil
|
|
}
|