# generate_common_db_data.py
#
# Builds data_common/table_common.json by matching the Pokorny and LIV root
# tables through data_common/matchup.csv, and writes a "common_id" field back
# into both source tables.
import json
import numpy as np
import pandas as pd
import tqdm
from generate_pokorny_scraped_data_OLD import remove_non_english_chars
from concepticon import add_concepticon_data
def _load_json(path):
    """Return the parsed JSON document stored at *path*."""
    # Read as UTF-8 explicitly rather than relying on the platform's
    # locale-dependent default encoding.
    with open(path, 'r', encoding="utf-8") as fp:
        return json.load(fp)


def _dump_json(data, path, **dump_kwargs):
    """Write *data* as JSON to *path*, forwarding *dump_kwargs* to ``json.dump``."""
    with open(path, 'w', encoding="utf-8") as fp:
        json.dump(data, fp, **dump_kwargs)


def _split_roots(cell):
    """Split a comma-separated cross-reference cell into stripped, non-empty roots."""
    # An empty cell (or a stray trailing comma) must not produce a '' root.
    return [root for root in (part.strip() for part in cell.split(",")) if root]


def _build_common_entries(match_df, pokorny_data, liv_data):
    """Create the common table from the match-up rows plus unmatched LIV roots.

    Args:
        match_df: match-up frame with a "root" column (Pokorny root) and a
            "liv: cross-reference" column (comma-separated LIV roots, "" when
            there is none).
        pokorny_data: Pokorny entries keyed by root.
        liv_data: LIV entries keyed by root.

    Returns:
        A ``(common_data, pokorny_ids, liv_ids)`` tuple: the list of common
        entries, and two dicts mapping a Pokorny / LIV root to the
        ``common_id`` of the common entry it belongs to.

    Raises:
        KeyError: if a match-up root is missing from *pokorny_data*.
    """
    common_data = []
    pokorny_ids = {}
    liv_to_pokorny = {}

    # One common entry per match-up row: the Pokorny root plus any LIV roots
    # cross-referenced on that row.
    for _, row in tqdm.tqdm(match_df.iterrows(), total=len(match_df)):
        pokorny_root = row["root"]
        liv_roots = _split_roots(row["liv: cross-reference"])
        # Ids are simply the position of the entry in creation order.
        common_id = str(len(common_data))
        common_data.append({
            "root": pokorny_root,
            "dictionary": [
                {"pokorny_entries": [pokorny_root]},
                {"liv_entries": liv_roots},
            ],
            "meaning": pokorny_data[pokorny_root]["meaning"],
            "common_id": common_id,
        })
        pokorny_ids[pokorny_root] = common_id
        for root in liv_roots:
            liv_to_pokorny[root] = pokorny_root

    # A matched LIV root shares the id of the Pokorny root it was matched to.
    liv_ids = {
        liv_root: pokorny_ids[pokorny_root]
        for liv_root, pokorny_root in liv_to_pokorny.items()
    }

    # LIV roots never referenced by the match-up get an entry of their own.
    # Walk liv_data in its own (insertion) order instead of a set difference,
    # so the ids handed out here are the same on every run.
    unused_liv = [root for root in liv_data if root not in liv_to_pokorny]
    for root in tqdm.tqdm(unused_liv):
        common_id = str(len(common_data))
        common_data.append({
            "root": root,
            "dictionary": [
                {"pokorny_entries": []},
                {"liv_entries": [root]},
            ],
            "meaning": liv_data[root]["meaning"],
            "common_id": common_id,
        })
        liv_ids[root] = common_id

    return common_data, pokorny_ids, liv_ids


def _assign_common_ids(entries, ids_by_root):
    """Set "common_id" on every entry and on each of its "cross" references.

    Raises:
        KeyError: if an entry's root is missing from *ids_by_root*, or a cross
            reference points at an unknown "entry_id".
    """
    for entry in entries:
        entry["common_id"] = ids_by_root[entry["root"]]

    # Cross references point at other entries of the same table by id; they
    # can only be resolved once every entry above carries its common_id.
    entries_by_id = {entry["entry_id"]: entry for entry in entries}
    for entry in entries:
        for cross_entry in entry.get("cross", ()):
            cross_entry["common_id"] = entries_by_id[cross_entry["id"]]["common_id"]


def main():
    """Generate the common root table and tag the Pokorny / LIV tables with it.

    Reads ``data_pokorny/table_pokorny.json``, ``data_liv/table_liv.json`` and
    ``data_common/matchup.csv``; writes ``data_common/table_common.json`` and
    rewrites both input JSON tables with an added "common_id" field on every
    entry and cross reference; finally calls ``add_concepticon_data()``.

    Raises:
        KeyError: if a match-up root is not in the Pokorny table, if a Pokorny
            root is absent from the match-up, or if a cross reference points
            at an unknown entry id.
    """
    pokorny_filename = "data_pokorny/table_pokorny.json"
    liv_filename = "data_liv/table_liv.json"
    pokorny_data_list = _load_json(pokorny_filename)
    liv_data_list = _load_json(liv_filename)

    # Index both tables by root for lookups while building the common table.
    pokorny_data = {entry["root"]: entry for entry in pokorny_data_list}
    liv_data = {entry["root"]: entry for entry in liv_data_list}

    match_df = pd.read_csv("data_common/matchup.csv")
    # Rows without a LIV match are read as NaN; normalise them to "".
    match_df["liv: cross-reference"] = match_df["liv: cross-reference"].fillna("")

    common_data, pokorny_ids, liv_ids = _build_common_entries(
        match_df, pokorny_data, liv_data
    )

    # Separate id maps per table: a LIV-only root that happens to be spelled
    # like a Pokorny root can no longer shadow it.
    _assign_common_ids(pokorny_data_list, pokorny_ids)
    _assign_common_ids(liv_data_list, liv_ids)

    # Sort the common table by root, ignoring non-English characters and case.
    common_data.sort(key=lambda entry: remove_non_english_chars(entry["root"]).lower())

    print("writing common")
    _dump_json(common_data, "data_common/table_common.json", indent=4)

    # The source tables are re-saved (compact, no indent) because they now
    # include "common_id" as a field.
    print("writing pokorny")
    _dump_json(pokorny_data_list, pokorny_filename)
    print("writing liv")
    _dump_json(liv_data_list, liv_filename)

    add_concepticon_data()
# Script entry point: run the generation only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    main()