-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathgenerate_data.py
More file actions
71 lines (57 loc) · 2.09 KB
/
generate_data.py
File metadata and controls
71 lines (57 loc) · 2.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# coding=utf-8
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# Code taken from https://github.com/facebookresearch/LLM-QAT
from transformers import AutoTokenizer
import torch
import json
import sys
import os
from absl import flags, app
from ml_collections import config_flags
from llama import get_llm
import numpy as np
from langdetect import detect
# Global absl flag registry; ``FLAGS.config`` is read inside ``main``.
FLAGS = flags.FLAGS

# Register ``--config``: a path to a config file. There is no default value,
# and the config is locked after it has been loaded.
config_flags.DEFINE_config_file(
    name='config',
    default=None,
    help_string='File path to the training hyperparameter configuration.',
    lock_config=True,
)
# ``max_length`` passed to the greedy prefix generation. It counts the single
# random prompt token too, so at most 4 new tokens are produced.
_PREFIX_MAX_LENGTH = 5
# ``max_length`` passed to the sampled continuation (prompt tokens included).
_SAMPLE_MAX_LENGTH = 2048
# JSON-lines file that receives one ``{"text": ...}`` record per sample.
_OUTPUT_PATH = "self_data.jsonl"


def _is_english(text):
    """Return True when ``detect`` classifies ``text`` as English ('en').

    Any error raised by ``detect`` is treated as "not English" so the caller
    simply draws a new prompt.
    """
    try:
        return detect(text) == 'en'
    except Exception:
        # Deliberate best-effort filter: a failed detection only means "try
        # another prompt". Unlike a bare ``except``, this does not swallow
        # KeyboardInterrupt/SystemExit, so the retry loop stays interruptible.
        return False


def main(argv):
    """Generate ``config.nsamples`` texts with the model and append them as JSONL.

    For every sample a random start token is drawn, a short prefix is decoded
    greedily, and the prefix is kept only if it is detected as English. The
    accepted prefix is then continued with sampling, and the decoded text is
    appended to ``self_data.jsonl`` as ``{"text": ...}``.

    Args:
        argv: Positional command-line arguments handed over by ``app.run``;
            unused.
    """
    del argv  # Unused.
    config = FLAGS.config

    # Seed NumPy (start-token choice) and torch (used by ``do_sample=True``)
    # so that a run can be reproduced from ``config.seed``.
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)

    print("Loading tokenizer")
    tokenizer = AutoTokenizer.from_pretrained(config.model, use_fast=False)
    print("Tokenizer loaded!")
    print("Loading model")
    model = get_llm(config.model, config.cache_dir, config.seqlen)
    model = model.cuda()
    print("Model loaded!")

    for _ in range(config.nsamples):
        # Keep drawing random start tokens until the greedy prefix decodes to
        # English text; otherwise re-generate the prefix.
        while True:
            start_vocab = np.random.randint(tokenizer.vocab_size)
            input_ids = torch.tensor([[start_vocab]]).cuda()
            prefix_ids = model.generate(
                input_ids, do_sample=False, max_length=_PREFIX_MAX_LENGTH)
            prefix_text = tokenizer.batch_decode(
                prefix_ids, skip_special_tokens=True)[0]
            if _is_english(prefix_text):
                break

        # Generate the remaining tokens by sampling from the accepted prefix.
        outputs = model.generate(
            prefix_ids, do_sample=True, max_length=_SAMPLE_MAX_LENGTH)

        # Decode the outputs and append the record. The file is re-opened per
        # sample so every finished sample is closed out to disk on its own.
        gen_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        record = {"text": gen_text[0]}
        with open(_OUTPUT_PATH, "a", encoding="utf-8") as f:
            f.write(json.dumps(record) + '\n')
if __name__ == "__main__":
    # ``--config`` has no default, so require it on the command line before
    # handing control to absl, which parses the flags and calls ``main``.
    flags.mark_flag_as_required('config')
    app.run(main)