# create_tokenizer.py
import os
import subprocess

import utils
from tokenizers import Tokenizer, models, trainers
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizerFast

cluster = utils.cluster.ClusterManager()

def count_lines_fast(filename: str) -> int:
    """Count the lines in a file by shelling out to `wc -l` (fast for large files)."""
    result = subprocess.run(["wc", "-l", filename], text=True, capture_output=True)
    if result.returncode != 0:
        raise RuntimeError(f"Error counting lines: {result.stderr}")
    return int(result.stdout.split()[0])
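

# Portable fallback (a sketch, not in the original script): count lines in pure
# Python for platforms where `wc` is unavailable. Reads in 1 MiB chunks so it
# stays memory-friendly on large files.
def count_lines_python(filename: str) -> int:
    with open(filename, "rb") as f:
        return sum(chunk.count(b"\n") for chunk in iter(lambda: f.read(1 << 20), b""))
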

def train_bpe_tokenizer(dataset_path: str, query_token_path: str, vocab_size: int = 400) -> None:
    """Train a BPE tokenizer on the dataset and save it in Hugging Face format."""
    # Initialize a tokenizer with a BPE model.
    tokenizer_base = Tokenizer(models.BPE())
    # Register the special tokens up front so their ids are fixed:
    # <pad>=0, <s>=1, </s>=2, <unk>=3.
    special_tokens = ["<pad>", "<s>", "</s>", "<unk>"]
    tokenizer_base.add_special_tokens(special_tokens)
    tokenizer_base.pre_tokenizer = Whitespace()
    # Wrap every encoded sequence with start and end tokens; the ids here must
    # match the positions of <s> and </s> in the list above.
    tokenizer_base.post_processor = TemplateProcessing(
        single="<s> $A </s>",
        special_tokens=[("<s>", 1), ("</s>", 2)],
    )
    # Configure the trainer with the desired vocabulary size.
    trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=special_tokens)
    # Report the size of the training corpus.
    text_file_path = os.path.join(dataset_path, "all.txt")
    print(f"Generating tokenizer from {text_file_path}")
    print(f"File has {count_lines_fast(text_file_path):,} lines.")
    # Echo the query tokens that will be part of the training files.
    print("Query tokens:")
    with open(query_token_path, "r") as f:
        for line in f:
            print(line.strip(), end=" ")
    print("")
    # Train the tokenizer on the dataset plus the query-token file.
    tokenizer_base.train(files=[text_file_path, query_token_path], trainer=trainer)
    # Wrap the trained tokenizer in a Hugging Face fast tokenizer and register
    # the special tokens so downstream code can rely on them.
    tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer_base)
    tokenizer.bos_token = "<s>"
    tokenizer.eos_token = "</s>"
    tokenizer.unk_token = "<unk>"
    tokenizer.pad_token = "<pad>"
    # Save the tokenizer to the cluster's artifact directory.
    tokenizer_path = os.path.join(cluster.artifact_dir, "tokenizers", "TaskData")
    os.makedirs(tokenizer_path, exist_ok=True)
    tokenizer.save_pretrained(tokenizer_path)
    print(f"Tokenizer trained and saved to {tokenizer_path}")


if __name__ == "__main__":
    dataset_path = os.path.join(cluster.data_dir, "TaskData", "MOD_mixed_TASK_add_LEN_49_INT_99_CHAR_249_CSL_False")
    query_token_path = os.path.join(cluster.data_dir, "TaskData", "query_token.txt")
    train_bpe_tokenizer(dataset_path, query_token_path, vocab_size=400)
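
    # Illustrative sanity check (an assumption, not part of the original
    # script): reload the saved tokenizer and round-trip a short string to
    # confirm the post-processor attaches <s>/</s>.
    saved_path = os.path.join(cluster.artifact_dir, "tokenizers", "TaskData")
    reloaded = PreTrainedTokenizerFast.from_pretrained(saved_path)
    ids = reloaded.encode("1 2 3")
    print(reloaded.convert_ids_to_tokens(ids))  # expect ['<s>', ..., '</s>']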