Skip to content

Commit c8ac02f

Browse files
authored
requirements : update transformers to 5.5.1 (ggml-org#21617)
* requirements : update transformers to 5.5.0 This commit updates the transformers dependency to version 5.5.0. The motivation for this is that transformers 5.5.0 includes support for Gemma4 and is required to be able to convert Gemma4 models. This is also causing issues for users of gguf-my-repo. Refs: https://huggingface.co/spaces/ggml-org/gguf-my-repo/discussions/202 * fix huggingface_hub version * set version of transformers to 5.5.0 * convert : add ty ignore directives to convert_hf_to_gguf.py This commit adds `ty: ignore` directives to transformers tokenizer fields/methods to avoid type check errors. There might be better ways to handle this, and perhaps this can be done in a follow-up commit. The motivation for this is that it looks like in transformers 5.5.0 AutoTokenizer.from_pretrained can return generic tokenizer types or None, and the type checker now produces an error when the conversion script accesses fields like tokenizer.vocab. * convert : add ty ignore to suppress type check errors * convert : remove incorrect type ignores * convert : fix remaining python checks I was running a newer version of ty locally, but I've switched to version 0.0.26, which is what CI uses, and I was then able to reproduce the errors. Sorry about the noise. * update transformers version to 5.5.1
1 parent 4ef9301 commit c8ac02f

12 files changed

Lines changed: 108 additions & 108 deletions

convert_hf_to_gguf.py

Lines changed: 83 additions & 83 deletions
Large diffs are not rendered by default.

convert_hf_to_gguf_update.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -296,7 +296,7 @@ def get_existing_models(convert_py):
296296
except Exception as e:
297297
raise OSError(f"Error loading tokenizer for model {name}.") from e
298298

299-
chktok = tokenizer.encode(CHK_TXT)
299+
chktok = tokenizer.encode(CHK_TXT) # ty: ignore[unresolved-attribute]
300300
chkhsh = sha256(str(chktok).encode()).hexdigest()
301301

302302
logger.info(f"model: {name}")
@@ -468,7 +468,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
468468

469469
with open(f"models/ggml-vocab-{name}.gguf.out", "w") as f:
470470
for text in tests:
471-
res = tokenizer.encode(text, add_special_tokens=False)
471+
res = tokenizer.encode(text, add_special_tokens=False) # ty: ignore[unresolved-attribute]
472472
for r in res:
473473
f.write(f" {r}")
474474
f.write("\n")

convert_lora_to_gguf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -402,7 +402,7 @@ def set_gguf_parameters(self):
402402
# the invocation string includes the "<|start_of_turn|>"
403403
# token, but the adapters themselves were trained to
404404
# activate _after_ that first token, so we drop it here.
405-
alora_invocation_tokens = tokenizer(invocation_string)["input_ids"][1:]
405+
alora_invocation_tokens = tokenizer(invocation_string)["input_ids"][1:] # ty: ignore[call-non-callable]
406406
if alora_invocation_tokens:
407407
logger.debug("GGUF KV: %s = %s", gguf.Keys.Adapter.ALORA_INVOCATION_TOKENS, alora_invocation_tokens)
408408
self.gguf_writer.add_key_value(

examples/model-conversion/scripts/causal/run-casual-gen-embeddings-org.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,10 @@
5353
print(f"Model name: {model_name}")
5454

5555
prompt = "Hello world today"
56-
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
56+
input_ids = tokenizer(prompt, return_tensors="pt").input_ids # ty: ignore[call-non-callable]
5757
print(f"Input tokens: {input_ids}")
5858
print(f"Input text: {repr(prompt)}")
59-
print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
59+
print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}") # ty: ignore[unresolved-attribute]
6060

6161
with torch.no_grad():
6262
outputs = model(input_ids, output_hidden_states=True)
@@ -92,7 +92,7 @@
9292

9393
# Print embeddings per token in the requested format
9494
print("\nToken embeddings:")
95-
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
95+
tokens = tokenizer.convert_ids_to_tokens(input_ids[0]) # ty: ignore[unresolved-attribute]
9696
for i, embedding in enumerate(token_embeddings):
9797
# Format: show first few values, ..., then last few values
9898
if len(embedding) > 10:

examples/model-conversion/scripts/utils/semantic_check.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -207,8 +207,8 @@ def main():
207207
else:
208208
model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)
209209

210-
encoded = tokenizer(prompt, return_tensors="pt")
211-
tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
210+
encoded = tokenizer(prompt, return_tensors="pt") # ty: ignore[call-non-callable]
211+
tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0]) # ty: ignore[unresolved-attribute]
212212
n_tokens = len(tokens)
213213
print(f"n_tokens: {n_tokens}");
214214
print(f"hidden_size: {model.config.hidden_size}")

gguf-py/gguf/vocab.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -543,7 +543,7 @@ def __init__(self, base_path: Path):
543543
cache_dir=base_path,
544544
local_files_only=True,
545545
)
546-
assert self.tokenizer.is_fast # assume tokenizer.json is used
546+
assert self.tokenizer.is_fast # assume tokenizer.json is used # ty: ignore[unresolved-attribute]
547547

548548
# Initialize lists and dictionaries for added tokens
549549
self.added_tokens_list = []
@@ -552,30 +552,30 @@ def __init__(self, base_path: Path):
552552

553553
# Process added tokens
554554
for tok, tokidx in sorted(
555-
self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
555+
self.tokenizer.get_added_vocab().items(), key=lambda x: x[1] # ty: ignore[unresolved-attribute]
556556
):
557557
# Only consider added tokens that are not in the base vocabulary
558-
if tokidx >= self.tokenizer.vocab_size:
558+
if tokidx >= self.tokenizer.vocab_size: # ty: ignore[unresolved-attribute]
559559
self.added_tokens_list.append(tok)
560560
self.added_tokens_dict[tok] = tokidx
561561
self.added_tokens_ids.add(tokidx)
562562

563563
# Store special tokens and their IDs
564564
self.specials = {
565-
tok: self.tokenizer.get_vocab()[tok]
566-
for tok in self.tokenizer.all_special_tokens
565+
tok: self.tokenizer.get_vocab()[tok] # ty: ignore[unresolved-attribute]
566+
for tok in self.tokenizer.all_special_tokens # ty: ignore[unresolved-attribute]
567567
}
568-
self.special_ids = set(self.tokenizer.all_special_ids)
568+
self.special_ids = set(self.tokenizer.all_special_ids) # ty: ignore[unresolved-attribute]
569569

570570
# Set vocabulary sizes
571-
self.vocab_size_base = self.tokenizer.vocab_size
571+
self.vocab_size_base = self.tokenizer.vocab_size # ty: ignore[unresolved-attribute]
572572
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
573573

574574
self.fname_tokenizer = fname_tokenizer
575575

576576
def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
577577
reverse_vocab = {
578-
id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
578+
id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items() # ty: ignore[unresolved-attribute]
579579
}
580580

581581
for token_id in range(self.vocab_size_base):
@@ -616,7 +616,7 @@ def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
616616
yield text.encode("utf-8"), score, toktype
617617

618618
def has_newline_token(self):
619-
return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
619+
return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab # ty: ignore[unresolved-attribute]
620620

621621
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
622622
yield from self.hf_tokens()

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ classifiers = [
1818
python = ">=3.9"
1919
numpy = "^1.25.0"
2020
sentencepiece = ">=0.1.98,<0.3.0"
21-
transformers = ">=4.35.2,<5.0.0"
21+
transformers = "==5.5.1"
2222
protobuf = ">=4.21.0,<5.0.0"
2323
gguf = { path = "./gguf-py" }
2424
torch = { version = "^2.2.0", source = "pytorch" }
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
numpy~=1.26.4
22
sentencepiece>=0.1.98,<0.3.0
33

4-
transformers>=4.57.1,<5.0.0
4+
transformers==5.5.1
55

66
gguf>=0.1.0
77
protobuf>=4.21.0,<5.0.0

requirements/requirements-tool_bench.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
aiohttp~=3.9.3
22
pytest~=8.3.3
3-
huggingface_hub>=0.34.0,<1.0
3+
huggingface_hub>=1.5.0,<2.0
44
matplotlib~=3.10.0
55
numpy~=1.26.4
66
openai~=2.14.0

tests/test-tokenizer-0.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
lines = f.readlines()
2020
s = ''.join(lines)
2121
t_start = time.time()
22-
res = tokenizer.encode(s, add_special_tokens=False)
22+
res = tokenizer.encode(s, add_special_tokens=False) # ty: ignore[unresolved-attribute]
2323
t_end = time.time()
2424
print('\nmain : tokenized in', "{:.3f}".format(1000.0 * (t_end - t_start)), 'ms (py)') # noqa: NP100
2525
with open(fname_out, 'w', encoding='utf-8') as f:

0 commit comments

Comments
 (0)