forked from congtuong/docile
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_split.py
More file actions
23 lines (21 loc) · 787 Bytes
/
data_split.py
File metadata and controls
23 lines (21 loc) · 787 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import glob
import pathlib
import os
import json
def data_split(data_path, train_percent=80):
    """Write train/val/trainval document-id lists for the entries in ``pdfs``.

    Every entry matched by ``<data_path>/pdfs/*`` contributes its file stem
    (the name without its final suffix). The sorted stems are written as JSON
    arrays to three files directly inside ``data_path``:

    * ``trainval.json`` -- all stems
    * ``train.json`` -- the first ``train_percent`` percent, rounded down
    * ``val.json`` -- the remaining stems

    Args:
        data_path: Directory (as a string) that contains the ``pdfs``
            sub-directory and receives the three JSON files.
        train_percent: Percentage (0-100) of the entries assigned to the
            training split. Defaults to 80, i.e. an 8/2 split.

    Raises:
        ValueError: If ``train_percent`` is outside the range 0-100.
    """
    if not 0 <= train_percent <= 100:
        raise ValueError(
            f"train_percent must be between 0 and 100, got {train_percent!r}"
        )

    # Escape data_path so glob metacharacters in it ("[", "*", "?") are
    # matched literally; only the trailing "*" acts as a wildcard.
    pattern = os.path.join(glob.escape(data_path), "pdfs", "*")

    # glob returns entries in arbitrary order; sorting makes the split
    # reproducible across runs and machines.
    stems = sorted(pathlib.Path(entry).stem for entry in glob.glob(pattern))

    # Number of entries in the training split (fraction truncated).
    split = int(train_percent * len(stems) / 100)

    outputs = {
        "trainval.json": stems,
        "train.json": stems[:split],
        # Slicing from `split` stays correct even when the validation part is
        # empty (a negative-index slice would return the whole list then).
        "val.json": stems[split:],
    }
    for filename, ids in outputs.items():
        # ensure_ascii=False may emit non-ASCII characters, so the encoding is
        # fixed to UTF-8 instead of depending on the platform default.
        with open(os.path.join(data_path, filename), "w", encoding="utf-8") as f:
            f.write(json.dumps(ids, ensure_ascii=False))
# Default dataset location used when this file is executed as a script.
dataset_path="/home/tip2k4/docile/data/docile/data/docile"

# Regenerate the split files only when run directly, so that importing this
# module does not write files as a side effect.
if __name__ == "__main__":
    data_split(dataset_path)