3_find_optimals.py
import networkx as nx
import evaluate
import os
import pandas as pd
import sys
import argparse
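# Note: `evaluate` here is the Hugging Face `evaluate` library; `evaluate.load("chrf")`
# below provides the chrF metric used to score similarity between candidate descriptions.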


def parse_arguments():
    parser = argparse.ArgumentParser(description="BetaDescribe: find optimal descriptions")
    parser.add_argument("--protein_name", type=str, help="Input protein name", default='protein')
    parser.add_argument("--working_dir", type=str, required=True, help="Path to save predictions")
    parser.add_argument("--rejection_results_file_name", type=str, help="Rejection results file name", default='rejection_summary')
    parser.add_argument("--optimal_results_file_name", type=str, help="Optimal descriptions file name", default='optimal_results')
    return parser.parse_args()
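

# The rejection-summary CSV produced by the previous pipeline step is expected to contain
# at least the columns used below: 'protein_name', 'prediction_type', 'alterntive_num',
# 'prediction_str', and 'is_rejected_origin_and_cell_location'.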
if __name__ == "__main__":
    args = parse_arguments()
    protein_name = args.protein_name

    # Load the rejection summary and keep only the rows flagged True in
    # 'is_rejected_origin_and_cell_location' by the previous rejection step.
    df_path = os.path.join(args.working_dir, f'{protein_name}_{args.rejection_results_file_name}.csv')
    df = pd.read_csv(df_path)
    df = df[df["is_rejected_origin_and_cell_location"] == True]

    chrf = evaluate.load("chrf")

    list_of_rows = []
    for protein, df_specific_protein in df.groupby(by=["protein_name"]):
        protein = protein[0]  # groupby with a list key yields a 1-tuple; unpack the protein name
        df_specific_protein.reset_index(inplace=True)
        df_specific_protein['ID'] = df_specific_protein.prediction_type.map(str) + "_" + df_specific_protein.alterntive_num.map(str)
        df_specific_protein = df_specific_protein.drop_duplicates(subset=['ID'], keep='last')
        df_specific_protein.reset_index(inplace=True)

        # Build a similarity graph: one node per candidate description, edge weights
        # are the symmetric chrF similarity between the two descriptions.
        G = nx.Graph()
        edges = {}
        for _, row_1 in df_specific_protein.iterrows():
            for _, row_2 in df_specific_protein.iterrows():
                if row_1['ID'] not in G.nodes:
                    G.add_node(row_1['ID'])
                if row_2['ID'] not in G.nodes:
                    G.add_node(row_2['ID'])
                if row_1['ID'] == row_2['ID']:
                    continue
                rows_title = f"{row_1['ID']}-{row_2['ID']}"
                rows_title_reversed = f"{row_2['ID']}-{row_1['ID']}"
                if rows_title in edges or rows_title_reversed in edges:
                    continue
                # chrF is not symmetric, so compute it in both directions and average.
                chrf_score_1 = chrf.compute(predictions=[row_1['prediction_str']], references=[row_2['prediction_str']])['score'] / 100
                chrf_score_2 = chrf.compute(predictions=[row_2['prediction_str']], references=[row_1['prediction_str']])['score'] / 100
                # The weight is a similarity score, not a distance: a weight of 0 means
                # no link, and the higher the weight, the closer the two nodes are.
                chrf_score = (chrf_score_1 + chrf_score_2) / 2
                edges[rows_title] = chrf_score
                edges[rows_title_reversed] = chrf_score
                G.add_edge(row_1['ID'], row_2['ID'], weight=edges[rows_title])
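
        # Cluster the candidate descriptions with Louvain community detection, grouping
        # highly similar descriptions into the same community; the resolution is tuned
        # below so the number of communities stays in a usable range.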
        try:
            communities = nx.community.louvain_communities(G, seed=123, resolution=1.1)
        except Exception:
            communities = []

        # For larger graphs, sweep the resolution parameter until the number of
        # communities lies between 3 and 10 (inclusive).
        backup_communities = None
        if len(G.nodes) > 8:
            res = 0.9
            while res < 1.4:
                if not (len(communities) > 10 or len(communities) <= 2):
                    if len(communities) == 2:
                        backup_communities = communities
                    break
                try:
                    communities = nx.community.louvain_communities(G, seed=123, resolution=res)
                except Exception as e:
                    print(e)
                res += 0.01
        if backup_communities:
            communities = backup_communities
        communities.sort(reverse=True, key=len)
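
        # Pick representative descriptions for this protein (at most three).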
        if len(G.nodes) == 1:
            # Only one candidate description: report it directly.
            prediction_str = df_specific_protein.iloc[0]['prediction_str']
            print('prediction 1:')
            print(f'\t{prediction_str}')
            list_of_rows.append({'protein': protein, 'option': 0, 'prediction_str': prediction_str})
            continue

        if len(communities[0]) > 1:
            # For each of the (up to three) largest communities, report the member with
            # the highest average similarity to the rest of its community.
            for idx, community in enumerate(communities):
                if idx >= 3:
                    break
                max_node = (None, -1)
                for node1 in community:
                    sum_weights = 0
                    for node2 in community:
                        if node1 == node2:
                            continue
                        title = f"{node1}-{node2}"
                        sum_weights += edges[title]
                    if len(community) > 1:
                        average_node = sum_weights / (len(community) - 1)
                    else:
                        average_node = sum_weights
                    if average_node > max_node[1]:
                        max_node = (node1, average_node)
                prediction_str = df_specific_protein.loc[df_specific_protein['ID'] == max_node[0], 'prediction_str'].item()
                print(f'prediction {idx + 1}:')
                print(f'\t{prediction_str}')
                list_of_rows.append({'protein': protein, 'option': idx + 1, 'prediction_str': prediction_str})
        else:
            # No multi-member communities: rank all candidates by their average
            # similarity to every other candidate and report the top three.
            nodes_scores = []
            for _, row_1 in df_specific_protein.iterrows():
                node1 = row_1['ID']
                sum_weights = 0
                for _, row_2 in df_specific_protein.iterrows():
                    node2 = row_2['ID']
                    if node1 == node2:
                        continue
                    title = f"{node1}-{node2}"
                    sum_weights += edges[title]
                average_node = sum_weights / (len(df_specific_protein.index) - 1)
                nodes_scores.append({'node': node1, 'average': average_node})
            nodes_scores.sort(reverse=True, key=lambda d: d['average'])
            for idx, node in enumerate(nodes_scores[0:min(len(nodes_scores), 3)]):
                prediction_str = df_specific_protein.loc[df_specific_protein['ID'] == node['node'], 'prediction_str'].item()
                print(f'prediction {idx + 1}:')
                print(f'\t{prediction_str}')
                list_of_rows.append({'protein': protein, 'option': idx, 'prediction_str': prediction_str})

    results_df = pd.DataFrame(list_of_rows)
    results_df.to_csv(os.path.join(args.working_dir, f'{protein_name}_{args.optimal_results_file_name}.csv'), index=False)
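
# Example invocation (directory and protein names are illustrative):
#   python 3_find_optimals.py --working_dir ./predictions --protein_name my_protein
# The selected descriptions are written to:
#   <working_dir>/<protein_name>_<optimal_results_file_name>.csv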