| 
									
										
										
										
											2023-07-22 04:33:56 +08:00
										 |  |  | package llm | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import ( | 
					
						
							|  |  |  | 	"encoding/binary" | 
					
						
							|  |  |  | 	"errors" | 
					
						
							| 
									
										
										
										
											2024-03-14 02:03:56 +08:00
										 |  |  | 	"fmt" | 
					
						
							| 
									
										
										
										
											2023-07-22 04:33:56 +08:00
										 |  |  | 	"io" | 
					
						
							| 
									
										
										
										
											2024-03-18 17:45:22 +08:00
										 |  |  | 	"strings" | 
					
						
							| 
									
										
										
										
											2023-07-22 04:33:56 +08:00
										 |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-13 01:01:20 +08:00
										 |  |  | type GGML struct { | 
					
						
							|  |  |  | 	container | 
					
						
							|  |  |  | 	model | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2023-07-22 04:33:56 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-18 02:37:27 +08:00
										 |  |  | type model interface { | 
					
						
							| 
									
										
										
										
											2024-03-14 02:03:56 +08:00
										 |  |  | 	KV() KV | 
					
						
							| 
									
										
										
										
											2024-04-04 06:00:31 +08:00
										 |  |  | 	Tensors() Tensors | 
					
						
							| 
									
										
										
										
											2023-07-22 04:33:56 +08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-29 09:54:01 +08:00
										 |  |  | type KV map[string]any | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-14 02:03:56 +08:00
										 |  |  | func (kv KV) u64(key string) uint64 { | 
					
						
							|  |  |  | 	switch v := kv[key].(type) { | 
					
						
							|  |  |  | 	case uint64: | 
					
						
							|  |  |  | 		return v | 
					
						
							|  |  |  | 	case uint32: | 
					
						
							|  |  |  | 		return uint64(v) | 
					
						
							|  |  |  | 	case float64: | 
					
						
							|  |  |  | 		return uint64(v) | 
					
						
							|  |  |  | 	default: | 
					
						
							|  |  |  | 		return 0 | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (kv KV) Architecture() string { | 
					
						
							|  |  |  | 	if s, ok := kv["general.architecture"].(string); ok { | 
					
						
							|  |  |  | 		return s | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	return "unknown" | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (kv KV) ParameterCount() uint64 { | 
					
						
							|  |  |  | 	return kv.u64("general.parameter_count") | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-08 08:44:03 +08:00
										 |  |  | func (kv KV) FileType() fileType { | 
					
						
							| 
									
										
										
										
											2024-03-14 02:03:56 +08:00
										 |  |  | 	if u64 := kv.u64("general.file_type"); u64 > 0 { | 
					
						
							| 
									
										
										
										
											2024-05-08 08:44:03 +08:00
										 |  |  | 		return fileType(uint32(u64)) | 
					
						
							| 
									
										
										
										
											2024-03-14 02:03:56 +08:00
										 |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-08 08:44:03 +08:00
										 |  |  | 	return fileTypeUnknown | 
					
						
							| 
									
										
										
										
											2024-03-14 02:03:56 +08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (kv KV) BlockCount() uint64 { | 
					
						
							|  |  |  | 	return kv.u64(fmt.Sprintf("%s.block_count", kv.Architecture())) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (kv KV) HeadCount() uint64 { | 
					
						
							|  |  |  | 	return kv.u64(fmt.Sprintf("%s.attention.head_count", kv.Architecture())) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (kv KV) HeadCountKV() uint64 { | 
					
						
							| 
									
										
										
										
											2024-04-03 07:37:59 +08:00
										 |  |  | 	if headCountKV := kv.u64(fmt.Sprintf("%s.attention.head_count_kv", kv.Architecture())); headCountKV > 0 { | 
					
						
							|  |  |  | 		return headCountKV | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	return 1 | 
					
						
							| 
									
										
										
										
											2024-03-14 02:03:56 +08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (kv KV) GQA() uint64 { | 
					
						
							| 
									
										
										
										
											2024-04-03 07:37:59 +08:00
										 |  |  | 	return kv.HeadCount() / kv.HeadCountKV() | 
					
						
							| 
									
										
										
										
											2024-03-14 02:03:56 +08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (kv KV) EmbeddingLength() uint64 { | 
					
						
							|  |  |  | 	return kv.u64(fmt.Sprintf("%s.embedding_length", kv.Architecture())) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (kv KV) ContextLength() uint64 { | 
					
						
							|  |  |  | 	return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture())) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-04 06:00:31 +08:00
										 |  |  | type Tensors []*Tensor | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (ts Tensors) Layers() map[string]Layer { | 
					
						
							|  |  |  | 	layers := make(map[string]Layer) | 
					
						
							|  |  |  | 	for _, t := range ts { | 
					
						
							|  |  |  | 		parts := strings.Split(t.Name, ".") | 
					
						
							|  |  |  | 		if parts[0] == "blk" { | 
					
						
							| 
									
										
										
										
											2024-04-18 01:29:12 +08:00
										 |  |  | 			// join first and second part, e.g. blk.%d
 | 
					
						
							|  |  |  | 			parts = append([]string{fmt.Sprintf("%s.%s", parts[0], parts[1])}, parts[2:]...) | 
					
						
							| 
									
										
										
										
											2024-04-04 06:00:31 +08:00
										 |  |  | 		} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		if _, ok := layers[parts[0]]; !ok { | 
					
						
							|  |  |  | 			layers[parts[0]] = make(Layer) | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		layers[parts[0]][strings.Join(parts[1:], ".")] = t | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	return layers | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | type Layer map[string]*Tensor | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (l Layer) size() (size uint64) { | 
					
						
							|  |  |  | 	for _, t := range l { | 
					
						
							| 
									
										
										
										
											2024-05-21 00:47:01 +08:00
										 |  |  | 		size += t.Size() | 
					
						
							| 
									
										
										
										
											2024-04-04 06:00:31 +08:00
										 |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	return size | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-29 09:54:01 +08:00
										 |  |  | type Tensor struct { | 
					
						
							| 
									
										
										
										
											2024-03-14 02:03:56 +08:00
										 |  |  | 	Name   string `json:"name"` | 
					
						
							|  |  |  | 	Kind   uint32 `json:"kind"` | 
					
						
							|  |  |  | 	Offset uint64 `json:"-"` | 
					
						
							| 
									
										
										
										
											2024-03-29 09:54:01 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	// Shape is the number of elements in each dimension
 | 
					
						
							| 
									
										
										
										
											2024-03-14 02:03:56 +08:00
										 |  |  | 	Shape []uint64 `json:"shape"` | 
					
						
							| 
									
										
										
										
											2024-03-29 09:54:01 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-14 02:03:56 +08:00
										 |  |  | 	io.WriterTo `json:"-"` | 
					
						
							| 
									
										
										
										
											2024-03-29 09:54:01 +08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (t Tensor) blockSize() uint64 { | 
					
						
							|  |  |  | 	switch { | 
					
						
							|  |  |  | 	case t.Kind < 2: | 
					
						
							|  |  |  | 		return 1 | 
					
						
							|  |  |  | 	case t.Kind < 10: | 
					
						
							|  |  |  | 		return 32 | 
					
						
							|  |  |  | 	default: | 
					
						
							|  |  |  | 		return 256 | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (t Tensor) typeSize() uint64 { | 
					
						
							|  |  |  | 	blockSize := t.blockSize() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	switch t.Kind { | 
					
						
							|  |  |  | 	case 0: // FP32
 | 
					
						
							|  |  |  | 		return 4 | 
					
						
							|  |  |  | 	case 1: // FP16
 | 
					
						
							|  |  |  | 		return 2 | 
					
						
							|  |  |  | 	case 2: // Q4_0
 | 
					
						
							|  |  |  | 		return 2 + blockSize/2 | 
					
						
							|  |  |  | 	case 3: // Q4_1
 | 
					
						
							|  |  |  | 		return 2 + 2 + blockSize/2 | 
					
						
							|  |  |  | 	case 6: // Q5_0
 | 
					
						
							|  |  |  | 		return 2 + 4 + blockSize/2 | 
					
						
							|  |  |  | 	case 7: // Q5_1
 | 
					
						
							|  |  |  | 		return 2 + 2 + 4 + blockSize/2 | 
					
						
							|  |  |  | 	case 8: // Q8_0
 | 
					
						
							|  |  |  | 		return 2 + blockSize | 
					
						
							|  |  |  | 	case 9: // Q8_1
 | 
					
						
							|  |  |  | 		return 4 + 4 + blockSize | 
					
						
							|  |  |  | 	case 10: // Q2_K
 | 
					
						
							|  |  |  | 		return blockSize/16 + blockSize/4 + 2 + 2 | 
					
						
							|  |  |  | 	case 11: // Q3_K
 | 
					
						
							|  |  |  | 		return blockSize/8 + blockSize/4 + 12 + 2 | 
					
						
							|  |  |  | 	case 12: // Q4_K
 | 
					
						
							|  |  |  | 		return 2 + 2 + 12 + blockSize/2 | 
					
						
							|  |  |  | 	case 13: // Q5_K
 | 
					
						
							|  |  |  | 		return 2 + 2 + 12 + blockSize/8 + blockSize/2 | 
					
						
							|  |  |  | 	case 14: // Q6_K
 | 
					
						
							|  |  |  | 		return blockSize/2 + blockSize/4 + blockSize/16 + 2 | 
					
						
							|  |  |  | 	case 15: // Q8_K
 | 
					
						
							|  |  |  | 		return 2 + blockSize + 2*blockSize/16 | 
					
						
							|  |  |  | 	case 16: // IQ2_XXS
 | 
					
						
							|  |  |  | 		return 2 + 2*blockSize/8 | 
					
						
							|  |  |  | 	case 17: // IQ2_XS
 | 
					
						
							|  |  |  | 		return 2 + 2*blockSize/8 + blockSize/32 | 
					
						
							|  |  |  | 	case 18: // IQ3_XXS
 | 
					
						
							|  |  |  | 		return 2 + 3*blockSize/8 | 
					
						
							|  |  |  | 	default: | 
					
						
							|  |  |  | 		return 0 | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (t Tensor) parameters() uint64 { | 
					
						
							|  |  |  | 	var count uint64 = 1 | 
					
						
							|  |  |  | 	for _, n := range t.Shape { | 
					
						
							|  |  |  | 		count *= n | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return count | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-21 00:47:01 +08:00
										 |  |  | func (t Tensor) Size() uint64 { | 
					
						
							| 
									
										
										
										
											2024-03-29 09:54:01 +08:00
										 |  |  | 	return t.parameters() * t.typeSize() / t.blockSize() | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-22 04:33:56 +08:00
										 |  |  | type container interface { | 
					
						
							|  |  |  | 	Name() string | 
					
						
							| 
									
										
										
										
											2024-03-10 04:28:36 +08:00
										 |  |  | 	Decode(io.ReadSeeker) (model, error) | 
					
						
							| 
									
										
										
										
											2023-07-22 04:33:56 +08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | const ( | 
					
						
							| 
									
										
										
										
											2023-09-08 01:55:37 +08:00
										 |  |  | 	// Magic constant for `ggml` files (unversioned).
 | 
					
						
							| 
									
										
										
										
											2023-07-22 04:33:56 +08:00
										 |  |  | 	FILE_MAGIC_GGML = 0x67676d6c | 
					
						
							| 
									
										
										
										
											2023-09-08 01:55:37 +08:00
										 |  |  | 	// Magic constant for `ggml` files (versioned, ggmf).
 | 
					
						
							| 
									
										
										
										
											2023-07-22 04:33:56 +08:00
										 |  |  | 	FILE_MAGIC_GGMF = 0x67676d66 | 
					
						
							| 
									
										
										
										
											2023-09-08 01:55:37 +08:00
										 |  |  | 	// Magic constant for `ggml` files (versioned, ggjt).
 | 
					
						
							| 
									
										
										
										
											2023-07-22 04:33:56 +08:00
										 |  |  | 	FILE_MAGIC_GGJT = 0x67676a74 | 
					
						
							| 
									
										
										
										
											2023-09-08 01:55:37 +08:00
										 |  |  | 	// Magic constant for `ggla` files (LoRA adapter).
 | 
					
						
							| 
									
										
										
										
											2023-07-22 04:33:56 +08:00
										 |  |  | 	FILE_MAGIC_GGLA = 0x67676C61 | 
					
						
							| 
									
										
										
										
											2023-09-08 01:55:37 +08:00
										 |  |  | 	// Magic constant for `gguf` files (versioned, gguf)
 | 
					
						
							| 
									
										
										
										
											2023-10-24 00:33:13 +08:00
										 |  |  | 	FILE_MAGIC_GGUF_LE = 0x46554747 | 
					
						
							|  |  |  | 	FILE_MAGIC_GGUF_BE = 0x47475546 | 
					
						
							| 
									
										
										
										
											2023-07-22 04:33:56 +08:00
										 |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-11-25 02:58:09 +08:00
										 |  |  | var ErrUnsupportedFormat = errors.New("unsupported model format") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-13 04:55:12 +08:00
										 |  |  | func DetectGGMLType(b []byte) string { | 
					
						
							|  |  |  | 	switch binary.LittleEndian.Uint32(b[:4]) { | 
					
						
							|  |  |  | 	case FILE_MAGIC_GGML: | 
					
						
							|  |  |  | 		return "ggml" | 
					
						
							|  |  |  | 	case FILE_MAGIC_GGMF: | 
					
						
							|  |  |  | 		return "ggmf" | 
					
						
							|  |  |  | 	case FILE_MAGIC_GGJT: | 
					
						
							|  |  |  | 		return "ggjt" | 
					
						
							|  |  |  | 	case FILE_MAGIC_GGLA: | 
					
						
							|  |  |  | 		return "ggla" | 
					
						
							|  |  |  | 	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE: | 
					
						
							|  |  |  | 		return "gguf" | 
					
						
							|  |  |  | 	default: | 
					
						
							|  |  |  | 		return "" | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-14 02:03:56 +08:00
										 |  |  | func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) { | 
					
						
							| 
									
										
										
										
											2023-11-25 03:57:20 +08:00
										 |  |  | 	var magic uint32 | 
					
						
							| 
									
										
										
										
											2024-03-10 04:28:36 +08:00
										 |  |  | 	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil { | 
					
						
							| 
									
										
										
										
											2024-03-14 02:03:56 +08:00
										 |  |  | 		return nil, 0, err | 
					
						
							| 
									
										
										
										
											2023-11-25 03:57:20 +08:00
										 |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	var c container | 
					
						
							|  |  |  | 	switch magic { | 
					
						
							| 
									
										
										
										
											2023-11-25 02:58:09 +08:00
										 |  |  | 	case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT: | 
					
						
							| 
									
										
										
										
											2024-03-14 02:03:56 +08:00
										 |  |  | 		return nil, 0, ErrUnsupportedFormat | 
					
						
							| 
									
										
										
										
											2023-07-22 04:33:56 +08:00
										 |  |  | 	case FILE_MAGIC_GGLA: | 
					
						
							| 
									
										
										
										
											2024-03-29 09:54:01 +08:00
										 |  |  | 		c = &containerGGLA{} | 
					
						
							| 
									
										
										
										
											2023-10-24 00:33:13 +08:00
										 |  |  | 	case FILE_MAGIC_GGUF_LE: | 
					
						
							| 
									
										
										
										
											2024-03-29 09:54:01 +08:00
										 |  |  | 		c = &containerGGUF{ByteOrder: binary.LittleEndian} | 
					
						
							| 
									
										
										
										
											2023-10-24 00:33:13 +08:00
										 |  |  | 	case FILE_MAGIC_GGUF_BE: | 
					
						
							| 
									
										
										
										
											2024-03-29 09:54:01 +08:00
										 |  |  | 		c = &containerGGUF{ByteOrder: binary.BigEndian} | 
					
						
							| 
									
										
										
										
											2023-07-22 04:33:56 +08:00
										 |  |  | 	default: | 
					
						
							| 
									
										
										
										
											2024-03-14 02:03:56 +08:00
										 |  |  | 		return nil, 0, errors.New("invalid file magic") | 
					
						
							| 
									
										
										
										
											2023-07-22 04:33:56 +08:00
										 |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-10 04:28:36 +08:00
										 |  |  | 	model, err := c.Decode(rs) | 
					
						
							| 
									
										
										
										
											2024-03-09 07:38:53 +08:00
										 |  |  | 	if errors.Is(err, io.EOF) { | 
					
						
							|  |  |  | 		// noop
 | 
					
						
							|  |  |  | 	} else if err != nil { | 
					
						
							| 
									
										
										
										
											2024-03-14 02:03:56 +08:00
										 |  |  | 		return nil, 0, err | 
					
						
							| 
									
										
										
										
											2023-07-22 04:33:56 +08:00
										 |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-10 04:28:36 +08:00
										 |  |  | 	offset, err := rs.Seek(0, io.SeekCurrent) | 
					
						
							|  |  |  | 	if err != nil { | 
					
						
							| 
									
										
										
										
											2024-03-14 02:03:56 +08:00
										 |  |  | 		return nil, 0, err | 
					
						
							| 
									
										
										
										
											2024-03-10 04:28:36 +08:00
										 |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-22 04:33:56 +08:00
										 |  |  | 	// final model type
 | 
					
						
							| 
									
										
										
										
											2023-11-25 03:57:20 +08:00
										 |  |  | 	return &GGML{ | 
					
						
							|  |  |  | 		container: c, | 
					
						
							|  |  |  | 		model:     model, | 
					
						
							| 
									
										
										
										
											2024-03-14 02:03:56 +08:00
										 |  |  | 	}, offset, nil | 
					
						
							| 
									
										
										
										
											2023-11-25 03:57:20 +08:00
										 |  |  | } | 
					
						
							| 
									
										
										
										
											2024-04-03 02:15:14 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-06 05:50:38 +08:00
										 |  |  | func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload uint64) { | 
					
						
							|  |  |  | 	embedding := llm.KV().EmbeddingLength() | 
					
						
							|  |  |  | 	heads := llm.KV().HeadCount() | 
					
						
							|  |  |  | 	headsKV := llm.KV().HeadCountKV() | 
					
						
							|  |  |  | 	vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any))) | 
					
						
							| 
									
										
										
										
											2024-04-03 02:15:14 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-12 01:26:35 +08:00
										 |  |  | 	layers := llm.Tensors().Layers() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-03 02:15:14 +08:00
										 |  |  | 	switch llm.KV().Architecture() { | 
					
						
							|  |  |  | 	case "llama": | 
					
						
							| 
									
										
										
										
											2024-04-06 05:50:38 +08:00
										 |  |  | 		fullOffload = 4 * batch * (1 + 4*embedding + context*(1+heads)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		partialOffload = 4 * batch * embedding | 
					
						
							|  |  |  | 		partialOffload += max( | 
					
						
							|  |  |  | 			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV), | 
					
						
							|  |  |  | 			4*batch*(embedding+vocab)+embedding*vocab*105/128, | 
					
						
							|  |  |  | 		) | 
					
						
							| 
									
										
										
										
											2024-04-12 01:26:35 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-23 07:57:05 +08:00
										 |  |  | 		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok { | 
					
						
							|  |  |  | 			// mixtral 8x22b
 | 
					
						
							|  |  |  | 			ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32)) | 
					
						
							|  |  |  | 			partialOffload = max( | 
					
						
							| 
									
										
										
										
											2024-05-21 00:47:01 +08:00
										 |  |  | 				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embedding/heads*headsKV), | 
					
						
							| 
									
										
										
										
											2024-04-23 07:57:05 +08:00
										 |  |  | 				4*(context*batch*heads+context*embedding/heads*headsKV+batch*1024+embedding/heads*headsKV*batch), | 
					
						
							|  |  |  | 			) | 
					
						
							|  |  |  | 		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok { | 
					
						
							|  |  |  | 			// mixtral 8x7b
 | 
					
						
							| 
									
										
										
										
											2024-04-12 01:26:35 +08:00
										 |  |  | 			ffnGateWeight1 := ffnGateWeight.Shape[1] | 
					
						
							|  |  |  | 			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1) | 
					
						
							|  |  |  | 			partialOffload = max( | 
					
						
							|  |  |  | 				4*batch*(3+embedding/heads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16, | 
					
						
							|  |  |  | 				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16), | 
					
						
							|  |  |  | 			) | 
					
						
							|  |  |  | 		} | 
					
						
							| 
									
										
										
										
											2024-04-06 05:50:38 +08:00
										 |  |  | 	case "gemma": | 
					
						
							|  |  |  | 		fullOffload = 4 * batch * (embedding + vocab) | 
					
						
							|  |  |  | 		partialOffload = 4*batch*(2*embedding+vocab+1) + embedding*vocab*105/128 | 
					
						
							|  |  |  | 	case "command-r": | 
					
						
							|  |  |  | 		fullOffload = max( | 
					
						
							|  |  |  | 			4*batch*(embedding+vocab), | 
					
						
							|  |  |  | 			4*batch*(2+4*embedding+context*(1+heads)), | 
					
						
							|  |  |  | 		) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		partialOffload = max( | 
					
						
							|  |  |  | 			4*batch*(embedding+vocab)+embedding*vocab*105/128, | 
					
						
							| 
									
										
										
										
											2024-04-12 01:26:35 +08:00
										 |  |  | 			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16, | 
					
						
							| 
									
										
										
										
											2024-04-06 05:50:38 +08:00
										 |  |  | 		) | 
					
						
							|  |  |  | 	case "qwen2": | 
					
						
							|  |  |  | 		fullOffload = max( | 
					
						
							|  |  |  | 			4*batch*(embedding+vocab), | 
					
						
							|  |  |  | 			4*batch*(1+2*embedding+context+context*heads), | 
					
						
							|  |  |  | 		) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		partialOffload = max( | 
					
						
							|  |  |  | 			4*batch*(embedding+vocab)+embedding*vocab*105/128, | 
					
						
							|  |  |  | 			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)), | 
					
						
							|  |  |  | 		) | 
					
						
							|  |  |  | 	case "phi2": | 
					
						
							|  |  |  | 		fullOffload = max( | 
					
						
							|  |  |  | 			4*batch*(embedding+vocab), | 
					
						
							|  |  |  | 			4*batch*(1+4*embedding+context+context*heads), | 
					
						
							|  |  |  | 		) | 
					
						
							| 
									
										
										
										
											2024-04-04 06:00:31 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-11 03:13:28 +08:00
										 |  |  | 		partialOffload = max( | 
					
						
							|  |  |  | 			4*batch*(2*embedding+vocab)+embedding*vocab*105/128, | 
					
						
							|  |  |  | 			4*batch*(2+3*embedding+context+context*heads), | 
					
						
							|  |  |  | 		) | 
					
						
							| 
									
										
										
										
											2024-04-18 04:57:19 +08:00
										 |  |  | 	case "stablelm": | 
					
						
							|  |  |  | 		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2) | 
					
						
							|  |  |  | 		partialOffload = max( | 
					
						
							|  |  |  | 			4*batch*(vocab+2*embedding), | 
					
						
							|  |  |  | 			fullOffload, | 
					
						
							|  |  |  | 		) | 
					
						
							| 
									
										
										
										
											2024-04-03 02:15:14 +08:00
										 |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-06 05:50:38 +08:00
										 |  |  | 	return | 
					
						
							| 
									
										
										
										
											2024-04-03 02:15:14 +08:00
										 |  |  | } |