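# gguf2mnn: fill an exported MNN LLM graph (llm.mnn.json) with the weights and
# tokenizer of a llama.cpp GGUF model, writing llm.mnn.weight, updating
# llm_config.json, and regenerating llm.mnn through MNN's converter.
#
# Example invocation (paths are illustrative):
#   python gguf2mnn.py --gguf model-q4_0.gguf --mnn_dir ./mnn_model --load_token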
import os
from gguf import gguf_reader
from gguf import constants
import numpy
import json
import argparse

from utils.mnn_utils import *

class TokenContent:
    def __init__(self):
        self.token_type = -1
        self.spec_ids = []
        self.names = []
        self.stop_ids = []
        self.pre_ids = []
        self.token_num = 0

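# GGUF stores per-token types in 'tokenizer.ggml.token_type'. Per gguf's
# TokenType enum, 1 is a normal token and 3/4 are control/user-defined tokens,
# which become MNN special tokens. The MNN token_type written below is
# 3 (HUGGINGFACE/BPE) for gguf's "gpt2" tokenizer and 1 for "llama"
# (SentencePiece-style).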
def load_token(reader):
    content = TokenContent()
    model = reader.fields['tokenizer.ggml.model'].parts[4].tobytes().decode('utf-8')
    field = reader.fields['tokenizer.ggml.token_type']
    valids = []
    for i in range(0, len(field.data)):
        p = field.data[i]
        if field.parts[p] == 1:
            # normal token
            valids.append(i)
        elif field.parts[p] == 3 or field.parts[p] == 4:
            # control / user-defined token
            valids.append(i)
            content.spec_ids.append(i)
    tokens = reader.fields['tokenizer.ggml.tokens']
    stops = ["<|eot_id|>", "<|im_end|>", "<|end|>", "<end_of_turn>", "<|endoftext|>", "<|eom_id|>", "<EOT>"]

    for i in valids:
        p = tokens.data[i]
        tok = tokens.parts[p].tobytes().decode('utf-8')
        if tok in stops:
            content.stop_ids.append(i)
        content.names.append(tok)
    content.token_num = len(content.names)
    if model == "gpt2":
        # bpe -> HUGGINGFACE
        content.token_type = 3
        # load merges
        merges = reader.fields['tokenizer.ggml.merges']
        for i in range(0, len(merges.data)):
            p = merges.data[i]
            tok = merges.parts[p].tobytes().decode('utf-8')
            content.names.append(tok)
    elif model == 'llama':
        content.token_type = 1
    else:
        print("[Error] Unsupported tokenizer model: %s; you can try downloading tokenizer.txt from an old MNN LLM model" % model)
    return content

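# tokenizer.txt layout, as written below: a magic line "430 <token_type>",
# a line "<special_count> <stop_count> 0", one line with the special then stop
# token ids, the token count (plus merge count for BPE), and finally one token
# (and one merge rule, for BPE) per line.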
def write_token_file(filename, token):
    with open(filename, 'w') as f:
        f.write("430 %d\n" % token.token_type)
        f.write("%d %d 0\n" % (len(token.spec_ids), len(token.stop_ids)))
        l = ""
        for i in token.spec_ids:
            l += "%d " % i
        for i in token.stop_ids:
            l += "%d " % i
        l += '\n'
        f.write(l)
        if token.token_type == 3:
            merge_num = len(token.names) - token.token_num
            f.write("%d %d\n" % (token.token_num, merge_num))
        else:
            f.write("%d\n" % token.token_num)
        for name in token.names:
            f.write(name + '\n')
    return

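# GGML Q4_0 stores a 32-weight block as a fp16 scale plus 16 bytes, where byte
# j keeps weight j in its low nibble and weight j+16 in its high nibble. MNN
# wants the two nibbles of a byte to be consecutive weights, so this shuffle
# re-interleaves them (weight 2k ends up in the high nibble, 2k+1 in the low).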
def shuffle_weight_int4(weight_main):
    # shuffle weight
    block_number = weight_main.shape[0]
    half_block_size = weight_main.shape[1]
    weight_main_low = weight_main % 16
    weight_main_high = weight_main // 16
    weight_main = numpy.concatenate([weight_main_low, weight_main_high], axis=1).reshape([block_number, half_block_size, 2])
    weight_main_low = weight_main[:, :, 1]
    weight_main_high = weight_main[:, :, 0]
    weight_main = weight_main_low + weight_main_high * 16
    return weight_main

# Reference, from llama.cpp's dequantize_row_q5_0:
# const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
# const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
#
# const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16;
# const int32_t x1 = ((x[i].qs[j] >>  4) | xh_1) - 16;
#
# y[i*qk + j + 0   ] = x0*d;
# y[i*qk + j + qk/2] = x1*d;
#
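# shuffle_weight_int5 reproduces the reference above with numpy: qh contributes
# the fifth bit of each weight and qs the low four bits. repack_low_bits()
# comes from utils.mnn_utils and is assumed to pack the resulting 8-bit values
# down to 5 bits each within 32-value blocks.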
def shuffle_weight_int5(weight, repack=True):
    block_number = weight.shape[0]
    qh = weight[:, 0:4]
    qs = weight[:, 4:20]
    x0 = qs & 0x0F
    x1 = qs >> 4
    qh = numpy.frombuffer(qh.tobytes(), numpy.uint32).reshape([block_number, 1])
    mask_0 = []
    mask_1 = []
    for i in range(0, 16):
        mask_0.append(((qh >> i) << 4) & 0x10)
        mask_1.append(((qh >> (i + 12))) & 0x10)
    mask_0 = numpy.concatenate(mask_0, axis=1)
    mask_1 = numpy.concatenate(mask_1, axis=1)
    x0 = x0 + mask_0
    x1 = x1 + mask_1
    x = numpy.concatenate([x0, x1], axis=1)
    if repack:
        return repack_low_bits(x, 5, 32)
    return x

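# GGML Q6_K packs 256 weights per 210-byte super-block: 128 bytes of low
# 4 bits (ql), 64 bytes of high 2 bits (qh), 16 int8 sub-block scales and one
# fp16 super-scale. extract_tensor_as_int8 reassembles the unsigned 6-bit
# weights in the same order as llama.cpp's dequantize_row_q6_K.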
def extract_tensor_as_int8(weight):
    ic = int(weight.shape[0])
    oc = int(weight.shape[1])
    if weight.tensor_type == constants.GGMLQuantizationType.Q6_K:
        block_size, type_size = constants.GGML_QUANT_SIZES[weight.tensor_type]
        block_number = oc * ic // block_size
        weight = weight.data.reshape([oc * ic // block_size, type_size])
        scale_int8 = weight[:, 192:208]
        scale_half = weight[:, 208:210]
        scale_int8 = numpy.frombuffer(scale_int8.tobytes(), numpy.int8).astype(numpy.float32).reshape([block_number, 16, 1])
        scale_half = numpy.frombuffer(scale_half.tobytes(), numpy.float16).astype(numpy.float32).reshape([block_number, 1, 1])
        weight_scale = scale_half * scale_int8

        # Extract to int8
        ql = weight[:, 0:128]
        qh = weight[:, 128:192]

        qall = []
        for i in range(256):
            qall.append(None)
        for nnp in range(0, 2):
            for l in range(0, 32):
                q1 = ((ql[:, l + 0 + 64 * nnp] & 0xF) | (((qh[:, l + 32 * nnp] >> 0) & 3) << 4))
                q2 = ((ql[:, l + 32 + 64 * nnp] & 0xF) | (((qh[:, l + 32 * nnp] >> 2) & 3) << 4))
                q3 = ((ql[:, l + 0 + 64 * nnp] >> 4) | (((qh[:, l + 32 * nnp] >> 4) & 3) << 4))
                q4 = ((ql[:, l + 32 + 64 * nnp] >> 4) | (((qh[:, l + 32 * nnp] >> 6) & 3) << 4))
                qall[l + 0 + 128 * nnp] = q1.reshape([block_number, 1])
                qall[l + 32 + 128 * nnp] = q2.reshape([block_number, 1])
                qall[l + 64 + 128 * nnp] = q3.reshape([block_number, 1])
                qall[l + 96 + 128 * nnp] = q4.reshape([block_number, 1])
        q_raw = numpy.concatenate(qall, axis=1)
        return q_raw, weight_scale, 16, 6
    elif weight.tensor_type == constants.GGMLQuantizationType.Q5_0:
        block_size, type_size = constants.GGML_QUANT_SIZES[weight.tensor_type]
        weight = weight.data.reshape([oc * ic // block_size, type_size])
        # Separate scale and bias
        weight_main = weight[:, 2:type_size]
        weight_main = shuffle_weight_int5(weight_main, False)
        weight_scale = weight[:, 0:2]
        weight_scale = numpy.frombuffer(weight_scale.tobytes(), numpy.float16).astype(numpy.float32)
        return weight_main, weight_scale, 32, 5
    return None

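# write_external_weight appends one convolution's weight data to llm.mnn.weight
# and returns the op's quantization description. 'external' appears to be
# [offset, weight bytes, alpha/scale bytes, bias bytes, 0]; for quantized types
# the fields are filled in by write_quant_parameters from utils.mnn_utils.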
def write_external_weight(weight, mnn_weight_file, mnn_weight_offset):
    ic = int(weight.shape[0])
    oc = int(weight.shape[1])
    bias_length = oc * 4
    conv = {}
    block_size = 0
    block_number = 0
    quant_bit = 0
    tie_embedding = False
    header_len = 0
    if weight.tensor_type == constants.GGMLQuantizationType.F16:
        # FP16
        quan = {}
        quan['type'] = 3
        conv['quanParameter'] = quan
        rawbytes = weight.data.tobytes()
        weightlen = mnn_weight_file.write(rawbytes)
        external = [mnn_weight_offset, weightlen, 0, bias_length, 0]
        conv['external'] = external
        mnn_weight_offset += weightlen
        tie_embedding = True
        quant_bit = 16
    elif weight.tensor_type == constants.GGMLQuantizationType.F32:
        # FP32, stored as FP16
        quan = {}
        quan['type'] = 3
        conv['quanParameter'] = quan
        rawbytes = weight.data.astype(numpy.float16).tobytes()
        weightlen = mnn_weight_file.write(rawbytes)
        external = [mnn_weight_offset, weightlen, 0, bias_length, 0]
        conv['external'] = external
        mnn_weight_offset += weightlen
    elif weight.tensor_type == constants.GGMLQuantizationType.Q4_0:
        tie_embedding = True
        quant_bit = 4
        block_size, type_size = constants.GGML_QUANT_SIZES[weight.tensor_type]
        block_number = oc * ic // block_size
        weight = weight.data.reshape([block_number, type_size])
        # Separate scale and bias
        weight_main = weight[:, 2:type_size]
        weight_scale = weight[:, 0:2]
        weight_scale = numpy.frombuffer(weight_scale.tobytes(), numpy.float16).astype(numpy.float32)

        # shuffle weight
        weight_main = shuffle_weight_int4(weight_main)
        conv, header_len, mnn_weight_offset = write_quant_parameters(quant_bit, False, mnn_weight_file, ic, oc, weight_main, weight_scale, mnn_weight_offset)
    elif weight.tensor_type == constants.GGMLQuantizationType.Q4_1:
        quant_bit = 4
        tie_embedding = True
        block_size, type_size = constants.GGML_QUANT_SIZES[weight.tensor_type]
        block_number = oc * ic // block_size
        weight = weight.data.reshape([oc * ic // block_size, type_size])
        # Separate scale and bias
        weight_main = weight[:, 4:type_size]

        # shuffle weight
        weight_main = shuffle_weight_int4(weight_main)

        weight_scale = weight[:, 0:2]
        weight_bias = weight[:, 2:4]
        weight_scale = numpy.frombuffer(weight_scale.tobytes(), numpy.float16).reshape((block_number, 1))
        weight_bias = numpy.frombuffer(weight_bias.tobytes(), numpy.float16).reshape((block_number, 1))
        scalebias = numpy.concatenate((weight_bias, weight_scale), axis=1).astype(numpy.float32)

        conv, header_len, mnn_weight_offset = write_quant_parameters(quant_bit, True, mnn_weight_file, ic, oc, weight_main, scalebias, mnn_weight_offset)
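    # GGML Q4_K packs 256 weights per super-block: fp16 d and dmin, 12 bytes of
    # packed 6-bit sub-block scales/mins, then 128 bytes of nibbles. Each of the
    # 8 sub-blocks of 32 weights dequantizes as d*sc*q - dmin*m, so below the
    # per-sub-block scale is sc*d and the bias is -m*dmin.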
    elif weight.tensor_type == constants.GGMLQuantizationType.Q4_K:
        quant_bit = 4
        tie_embedding = True
        block_size, type_size = constants.GGML_QUANT_SIZES[weight.tensor_type]
        block_number = oc * ic // block_size
        weight = weight.data.reshape([oc * ic // block_size, type_size])
        # Separate scale and bias
        d = weight[:, 0:2]
        dmin = weight[:, 2:4]
        scales = weight[:, 4:16]
        weight_main = weight[:, 16:type_size]

        # shuffle weight
        weight_main = weight_main.reshape((block_number * 4, 32))
        weight_main = shuffle_weight_int4(weight_main)

        # Compute scale
        d = numpy.frombuffer(d.tobytes(), numpy.float16).reshape((block_number, 1)).astype(numpy.float32)
        dmin = numpy.frombuffer(dmin.tobytes(), numpy.float16).reshape((block_number, 1)).astype(numpy.float32)

        def get_scale_min_k4(j, q):
            if j < 4:
                d = q[:, j] & 63
                m = q[:, j + 4] & 63
            else:
                d = (q[:, j + 4] & 0xF) | ((q[:, j - 4] >> 6) << 4)
                m = (q[:, j + 4] >> 4) | ((q[:, j - 0] >> 6) << 4)
            return d, m

        dgroup = []
        mgroup = []
        for j in range(0, 8):
            dgroup.append(None)
            mgroup.append(None)
        for j in range(0, 8):
            vd, vm = get_scale_min_k4(j, scales)
            vd = vd.reshape((block_number, 1))
            vm = vm.reshape((block_number, 1))
            vd = vd.astype(numpy.float32) * d
            vm = vm.astype(numpy.float32) * dmin
            dgroup[j] = vd
            mgroup[j] = -vm
        weight_scale = numpy.concatenate(dgroup, -1).reshape((block_number, 8, 1))
        weight_bias = numpy.concatenate(mgroup, -1).reshape((block_number, 8, 1))
        scalebias = numpy.concatenate((weight_bias, weight_scale), axis=-1).astype(numpy.float32)

        block_size = 32

        conv, header_len, mnn_weight_offset = write_quant_parameters(quant_bit, True, mnn_weight_file, ic, oc, weight_main, scalebias, mnn_weight_offset)
    elif weight.tensor_type == constants.GGMLQuantizationType.Q8_0:
        quant_bit = 8
        tie_embedding = True
        block_size, type_size = constants.GGML_QUANT_SIZES[weight.tensor_type]
        weight = weight.data.reshape([oc * ic // block_size, type_size])
        # Separate scale and bias
        weight_main = weight[:, 2:type_size]
        weight_scale = weight[:, 0:2]
        weight_scale = numpy.frombuffer(weight_scale.tobytes(), numpy.float16).astype(numpy.float32)
        # shift the int8 weights to unsigned storage (offset binary, +128)
        weight_main = numpy.frombuffer(weight_main.tobytes(), numpy.int8).astype(numpy.int16) + 128
        weight_main = weight_main.astype(numpy.uint8)
        conv, header_len, mnn_weight_offset = write_quant_parameters(quant_bit, False, mnn_weight_file, ic, oc, weight_main, weight_scale, mnn_weight_offset)
    elif weight.tensor_type == constants.GGMLQuantizationType.Q5_0:
        tie_embedding = False
        block_size, type_size = constants.GGML_QUANT_SIZES[weight.tensor_type]
        weight = weight.data.reshape([oc * ic // block_size, type_size])
        # Separate scale and bias
        weight_main = weight[:, 2:type_size]
        weight_main = shuffle_weight_int5(weight_main)
        weight_scale = weight[:, 0:2]
        weight_scale = numpy.frombuffer(weight_scale.tobytes(), numpy.float16).astype(numpy.float32)
        quant_bit = 5
        conv, header_len, mnn_weight_offset = write_quant_parameters(quant_bit, False, mnn_weight_file, ic, oc, weight_main, weight_scale, mnn_weight_offset)
    elif weight.tensor_type == constants.GGMLQuantizationType.Q5_1:
        tie_embedding = False
        block_size, type_size = constants.GGML_QUANT_SIZES[weight.tensor_type]
        block_number = oc * ic // block_size
        weight = weight.data.reshape([oc * ic // block_size, type_size])
        # Separate scale and bias
        weight_main = weight[:, 4:type_size]
        weight_main = shuffle_weight_int5(weight_main)
        weight_scale = weight[:, 0:2]
        weight_bias = weight[:, 2:4]
        weight_scale = numpy.frombuffer(weight_scale.tobytes(), numpy.float16).reshape((block_number, 1))
        weight_bias = numpy.frombuffer(weight_bias.tobytes(), numpy.float16).reshape((block_number, 1))
        weight_scale = numpy.concatenate((weight_bias, weight_scale), axis=1).astype(numpy.float32)
        quant_bit = 5
        conv, header_len, mnn_weight_offset = write_quant_parameters(quant_bit, True, mnn_weight_file, ic, oc, weight_main, weight_scale, mnn_weight_offset)
    elif weight.tensor_type == constants.GGMLQuantizationType.Q6_K:
        block_size, type_size = constants.GGML_QUANT_SIZES[weight.tensor_type]
        block_number = oc * ic // block_size
        q_raw, weight_scale, block_size, bits = extract_tensor_as_int8(weight)
        weight_main = repack_low_bits(q_raw, 6, 256)
        quant_bit = 6
        conv, header_len, mnn_weight_offset = write_quant_parameters(quant_bit, False, mnn_weight_file, ic, oc, weight_main, weight_scale, mnn_weight_offset)

    else:
        print('Unsupported tensor type: ', weight.tensor_type)
        print(weight.data.shape, ic, oc)
        assert(False)
    return mnn_weight_offset, conv, tie_embedding, block_size, quant_bit, header_len

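# convert() rewrites the exported MNN graph in place: LayerNorm gamma/beta are
# copied straight from the GGUF tensors, every convolution weight is rewritten
# into llm.mnn.weight, the embedding table is handled last, and the patched
# JSON is fed back through MNN's converter to regenerate llm.mnn.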
def convert(args):
    gguf = args.gguf
    mnn_dir = args.mnn_dir
    src_json = os.path.join(mnn_dir, "llm.mnn.json")
    dst_json = os.path.join(mnn_dir, "llm.mnn_new.json")

    mnn, opmap, convs, _, __ = load_mnn(src_json)
    llm_config = {}
    with open(os.path.join(mnn_dir, "llm_config.json")) as f:
        llm_config = json.load(f)

    reader = gguf_reader.GGUFReader(gguf)
    if args.load_token:
        write_token_file(os.path.join(mnn_dir, "tokenizer.txt"), load_token(reader))
    arch = reader.fields['general.architecture'].parts[4].tobytes().decode('utf-8')
    print("Arch:", arch)
    tensormap = {}
    for t in reader.tensors:
        tensormap[t.name] = t

    mnn_weight_file = open(os.path.join(mnn_dir, "llm.mnn.weight"), "wb")
    mnn_weight_offset = 0
    if 'tie_embeddings' in llm_config:
        del llm_config['tie_embeddings']
    for name in opmap:
        op = opmap[name]
        if op['type'] == 'LayerNorm':
            print('Load layernorm: ', name)
            weight_tensor = tensormap[name + '.weight']
            layernorm = op['main']
            layernorm['gamma'] = weight_tensor.data.tolist()
            if name + '.bias' in tensormap:
                layernorm['beta'] = tensormap[name + '.bias'].data.tolist()
            else:
                layernorm['beta'] = [0.0] * len(layernorm['gamma'])
            continue
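    # Rewrite every convolution: take its weight from the GGUF tensors (falling
    # back to the tied token_embd.weight for the 'output' head), append it to
    # llm.mnn.weight, and point the op's quanParameter/external entries at it.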
    for op in convs:
        conv = op['main']
        name = op['name']
        if 'quanParameter' in conv:
            del conv['quanParameter']
        weight_name = name + '.weight'
        weight = None
        tie_embedding = False
        ichannel = conv['common']['inputCount']
        ochannel = conv['common']['outputCount']
        if name == 'output':
            print('hidden size: ', ichannel)
            llm_config['hidden_size'] = ichannel
        if weight_name in tensormap:
            weight = tensormap[weight_name]
        elif name == 'output':
            weight = tensormap['token_embd.weight']
            tie_embedding = True
        else:
            print("Error: Can't find weight for " + name)
            assert(False)
        print('Load Convolution: ', name, ", weight type: ", weight.tensor_type)
        if weight.shape[0] != ichannel or weight.shape[1] != ochannel:
            print(name, ": inputCount/outputCount", ichannel, ochannel, "do not match weight shape, reset to", weight.shape)
            ichannel = int(weight.shape[0])
            ochannel = int(weight.shape[1])
            conv['common']['inputCount'] = ichannel
            conv['common']['outputCount'] = ochannel
            # Change post reshape for convolution
            outputIndex = op['outputIndexes'][0]
            for subop in mnn["oplists"]:
                if 'inputIndexes' not in subop:
                    continue
                if subop['inputIndexes'][0] == outputIndex and subop['type'] == 'ConvertTensor':
                    outputIndex = subop['outputIndexes'][0]
                    break
            for subop in mnn["oplists"]:
                if 'inputIndexes' not in subop:
                    continue
                if subop['inputIndexes'][0] == outputIndex and subop['type'] == 'Reshape':
                    subop['main']['dims'][2] = ochannel
                    break
        mnn_weight_offset, conv_new, can_tie_embedding, block_size, quant_bit, header_len = write_external_weight(weight, mnn_weight_file, mnn_weight_offset)
        if not can_tie_embedding:
            tie_embedding = False
        conv['quanParameter'] = conv_new['quanParameter']
        conv['external'] = conv_new['external']

        bias = None
        bias_name = name + '.bias'
        if bias_name in tensormap:
            if tensormap[bias_name].tensor_type > 1:
                print('Error: Bias is quantized: ', tensormap[bias_name].tensor_type)
                assert(False)
            bias = tensormap[bias_name].data.astype(numpy.float32)
        else:
            bias = numpy.zeros(ochannel).astype(numpy.float32)
        mnn_weight_offset += mnn_weight_file.write(bias.tobytes())
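        # If the output head reused token_embd.weight, record where that weight
        # and its scales live inside llm.mnn.weight so the runtime can share it
        # as the embedding table:
        # [weight_offset, alpha_offset, alpha_size, quant_bit, block_size].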
        if tie_embedding:
            external = conv['external']
            weight_offset = external[0] + header_len
            alpha_offset = external[0] + external[1]
            alpha_size = external[2]
            llm_config['tie_embeddings'] = [weight_offset, alpha_offset, alpha_size, quant_bit, 32]

    embedding_file = os.path.join(mnn_dir, "embeddings_bf16.bin")

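    # If the output head could not be tied, materialize the embedding table:
    # float tensors are dumped to embeddings_bf16.bin as bf16 (the high 16 bits
    # of each fp32 value), while quantized tensors are appended to
    # llm.mnn.weight and referenced through llm_config['tie_embeddings'].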
    embedding_in_weight = True
    if 'tie_embeddings' not in llm_config:
        # Need write embedding
        weight = tensormap['token_embd.weight']
        print("Embedding type: ", weight.tensor_type)
        if weight.tensor_type <= 1:
            embedding_in_weight = False
            print("Write ", embedding_file)
            weight = weight.data.astype(numpy.float32)
            weight = numpy.frombuffer(weight.tobytes(), numpy.uint32) >> 16
            weight = weight.astype(numpy.uint16)
            with open(embedding_file, 'wb') as f:
                f.write(weight.tobytes())
        elif weight.tensor_type == constants.GGMLQuantizationType.Q8_0 or weight.tensor_type == constants.GGMLQuantizationType.Q4_0 or weight.tensor_type == constants.GGMLQuantizationType.Q4_1:
            mnn_weight_offset, conv, can_tie_embedding, block_size, quant_bit, header_len = write_external_weight(weight, mnn_weight_file, mnn_weight_offset)
            external = conv['external']
            weight_offset = external[0] + header_len
            alpha_offset = external[0] + external[1]
            alpha_size = external[2]
            llm_config['tie_embeddings'] = [weight_offset, alpha_offset, alpha_size, quant_bit, block_size]
        elif weight.tensor_type == constants.GGMLQuantizationType.Q6_K or weight.tensor_type == constants.GGMLQuantizationType.Q5_0:
            q_raw, weight_scale, block_size, bits = extract_tensor_as_int8(weight)
            # embedding_in_weight = False
            ic = int(weight.shape[0])
            oc = int(weight.shape[1])
            offset = (1 << (bits - 1))
            # re-pack as plain 8-bit values, recentered around 128
            q_raw = repack_low_bits(q_raw, 8, q_raw.shape[1])
            q_raw = q_raw + (128 - offset)
            quant_bit = 8
            conv, header_len, mnn_weight_offset = write_quant_parameters(quant_bit, False, mnn_weight_file, ic, oc, q_raw, weight_scale, mnn_weight_offset)
            external = conv['external']
            weight_offset = external[0] + header_len
            alpha_offset = external[0] + external[1]
            alpha_size = external[2]
            llm_config['tie_embeddings'] = [weight_offset, alpha_offset, alpha_size, quant_bit, block_size]
        else:
            assert(False)

    if embedding_in_weight:
        if os.path.exists(embedding_file):
            os.remove(embedding_file)

    mnn_weight_file.close()
    with open(dst_json, 'w') as f:
        f.write(json.dumps(mnn, indent=4))
    with open(os.path.join(mnn_dir, "llm_config.json"), 'w') as f:
        f.write(json.dumps(llm_config, indent=4))

    convert_args = [
        '',
        '-f',
        'JSON',
        '--modelFile',
        dst_json,
        '--MNNModel',
        os.path.join(mnn_dir, 'llm.mnn'),
    ]

    print(convert_args)
    from MNN.tools import mnnconvert
    mnnconvert.convert(convert_args)
    os.remove(dst_json)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='gguf2mnn', formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--gguf', type=str, required=True, help='src gguf model')
    parser.add_argument('--mnn_dir', type=str, required=True, help='mnn llm dir')
    parser.add_argument('--load_token', action='store_true', help='Override tokenizer.txt from gguf')
    args = parser.parse_args()
    import time
    sta = time.time()
    convert(args)
    fin = time.time()
    print("Cost time ", fin - sta, " s")