 sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
 import gguf
 
-# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-
-
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a significant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    return dict(zip(bs, (chr(n) for n in cs)))
-
 
 def count_model_parts(dir_model: Path) -> int:
     num_parts = 0
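The helper deleted above implements OpenAI's GPT-2 byte-level trick: every possible byte value is assigned a distinct printable unicode character, so BPE can work on strings without tripping over whitespace or control characters. A minimal round-trip check (standalone, not part of the patch) showing that the mapping is fully reversible, so a consumer of the raw token strings can reconstruct the underlying bytes later:

```python
# Round-trip check of the GPT-2 byte<->unicode mapping removed above.
def bytes_to_unicode():
    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8+n)
            n += 1
    return dict(zip(bs, (chr(n) for n in cs)))

byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}

assert len(byte_encoder) == 256       # every byte value is covered
assert byte_encoder[ord(" ")] == "Ġ"  # space maps to a printable stand-in
assert all(byte_decoder[byte_encoder[b]] == b for b in range(256))  # reversible
```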
@@ -131,6 +108,8 @@ def parse_args() -> argparse.Namespace:
 print("gguf: get tokenizer metadata")
 
 tokens: list[bytearray] = []
+scores: list[float] = []
+toktypes: list[int] = []
 
 tokenizer_json_file = dir_model / 'tokenizer.json'
 if not tokenizer_json_file.is_file():
@@ -155,31 +134,15 @@ def parse_args() -> argparse.Namespace:
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
-byte_encoder = bytes_to_unicode()
-byte_decoder = {v: k for k, v in byte_encoder.items()}
 
 for i in range(vocab_size):
-    if i in reverse_vocab:
-        try:
-            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
-        except KeyError:
-            text = bytearray()
-            for c in reverse_vocab[i]:
-                if ord(c) < 256:  # single byte character
-                    try:
-                        text.append(byte_decoder[c])
-                    except KeyError:
-                        text.extend(c.encode('utf-8'))
-                else:  # multibyte special token character
-                    text.extend(c.encode('utf-8'))
-    else:
-        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token. (It's normal for MPT.)")
-        pad_token = f"[PAD{i}]".encode("utf8")
-        text = bytearray(pad_token)
-
-    tokens.append(text)
+    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
+    scores.append(0.0)  # dummy
+    toktypes.append(gguf.TokenType.NORMAL)
 
 gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
+gguf_writer.add_token_types(toktypes)
 
 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)
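Taken together, the new path just snapshots the tokenizer's vocabulary and fills in placeholder metadata. A minimal sketch under stated assumptions: the `transformers` and `gguf` Python packages are available, and `dir_model` / `vocab_size` stand in for values the real script takes from the command line and `config.json`:

```python
from pathlib import Path

from transformers import AutoTokenizer
import gguf

dir_model = Path("mpt-7b")   # placeholder: local MPT checkpoint directory
vocab_size = 50432           # placeholder: hparams["vocab_size"] from config.json

tokenizer = AutoTokenizer.from_pretrained(dir_model)
reverse_vocab = {token_id: tok for tok, token_id in tokenizer.vocab.items()}

# Raw BPE token strings are stored unchanged; ids missing from the tokenizer
# (normal for MPT, whose n_vocab is padded) become [PAD{i}] placeholder tokens.
tokens = [reverse_vocab.get(i, f"[PAD{i}]") for i in range(vocab_size)]
scores = [0.0] * vocab_size                      # dummy scores, as in the hunk
toktypes = [gguf.TokenType.NORMAL] * vocab_size  # every entry marked NORMAL
```

Decoding the byte-level token strings back into bytes is deferred to whatever loads the GGUF file, which is what lets the hunk above drop the `byte_decoder` loop entirely.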
@@ -239,10 +202,6 @@ def parse_args() -> argparse.Namespace:
 
     print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
 
-
-    # if new_name == "wte.weight" and data.shape[0] == 50432 and vocab_size == 50254:
-    #     data = data[0:vocab_size,:]
-
     gguf_writer.add_tensor(new_name, data)
 
     # note: MPT output is tied to (same as) wte in original model;