DeepFavored/identify.py at master · jitang-github/DeepFavored · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
"""
* DeepFavored - A deep learning based method to identify favored(adaptive) mutations in population genomes.
* Copyright (C) 2022 Ji Tang, Hao Zhu
*
* This program is licensed for academic research use only
* unless otherwise stated. Contact jitang1024@outlook.com, zhuhao@smu.edu.cn for
* commercial licensing options.
*
* For use in any publications please cite: Ji Tang, Maosheng Huang, Sha He, Junxiang Zeng, Hao Zhu (2022). Uncovering the extensive trade-off between adaptive evolution and disease susceptibility. Cell Reports, Volume 40, Issue 11, 111351.
"""

import json
import os

import pandas as pd
import tensorflow as tf
from tqdm import tqdm


def Identify(modelDir, inOutFiles):
    """Score the sites of each input table with a saved DeepFavored model.

    The model is restored once from ``modelDir`` and reused for every file.
    For each input file the rows containing any missing value are dropped,
    the feature columns are fed to the network, and a tab-separated table
    with the columns ``POS``, ``DF_H``, ``DF_O`` and ``DF`` is written, where
    ``DF`` is the product of the two network outputs. Scores are rounded to
    three decimals and rows are sorted by position.

    Args:
        modelDir: Directory holding ``hyperparams.json`` and the TensorFlow
            checkpoint ``model/model`` (with its ``model/model.meta``).
        inOutFiles: Iterable of ``(inFile, outFile)`` path pairs. Each input
            is a tab-separated table with a ``POS`` column plus every column
            named in ``trainDataHyparams.componentStats`` of the
            hyper-parameter file. ``.df.out`` is appended to an output path
            that does not already end with it.
    """
    print('### Run DeepFavored ...')
    with open(modelDir + '/hyperparams.json') as f:
        confArgs = json.load(f)
    features = confArgs['trainDataHyparams']['componentStats']
    # Restore into a dedicated graph so the model's ops stay separate from
    # anything already registered in the process-wide default graph.
    with tf.Graph().as_default():
        with tf.compat.v1.Session() as sess:
            saver = tf.compat.v1.train.import_meta_graph(modelDir + '/model/model.meta')  # , clear_devices=True)
            saver.restore(sess, modelDir + '/model/model')
            graph = tf.compat.v1.get_default_graph()

            X_tensor = graph.get_tensor_by_name("X:0")
            H_tensor = graph.get_tensor_by_name("Y1_pred:0")
            O_tensor = graph.get_tensor_by_name("Y2_pred:0")

            for inFile, outFile in tqdm(inOutFiles):
                df = pd.read_csv(inFile, sep='\t')
                df.dropna(inplace=True)
                X = df[features].values
                positions = df.POS.values
                H, O = sess.run([H_tensor, O_tensor], feed_dict={X_tensor: X})

                rows = []
                for pos, h, o in zip(positions, H, O):
                    # Only the first element of each prediction row is used
                    # (assumes one score per site in index 0 -- TODO confirm).
                    h, o = h[0], o[0]
                    DFscore = h * o
                    # Store the position as a plain int: pandas reads POS as
                    # float when the column held missing values, which would
                    # otherwise be written out as e.g. "12345.0".
                    rows.append((int(pos), round(h, 3), round(o, 3), round(DFscore, 3)))
                rows.sort(key=lambda row: row[0])

                if not outFile.endswith('.df.out'):
                    outFile += '.df.out'
                with open(outFile, 'w') as f:
                    f.write('POS\tDF_H\tDF_O\tDF\n')
                    for row in rows:
                        f.write('\t'.join(str(i) for i in row) + '\n')


def IdentifyWithArgs(args):
    """Build the input/output file pairs from parsed arguments and run Identify.

    Args:
        args: Object with the attributes ``input`` (path of one input file,
            or of a directory of input files), ``outDir`` (output directory,
            or ``None`` to write next to the input) and ``modelDir``
            (directory of the trained model, passed through to ``Identify``).

    Each output file is given the same name as its input file inside the
    output directory; ``Identify`` adds the ``.df.out`` suffix.
    """
    inputIsFile = os.path.isfile(args.input)
    outDir = args.outDir
    if outDir is None:
        if inputIsFile:
            # Directory that contains the input file; a bare file name has an
            # empty dirname, which means the current directory.
            outDir = os.path.dirname(args.input) or '.'
        else:
            outDir = args.input
    os.makedirs(outDir, exist_ok=True)

    inOutFiles = []
    if inputIsFile:
        fileName = os.path.basename(args.input)
        inOutFiles.append((args.input, os.path.join(outDir, fileName)))
    else:
        for fileName in sorted(os.listdir(args.input)):
            inFile = os.path.join(args.input, fileName)
            # Skip sub-directories and result files left by an earlier run
            # (the output directory defaults to the input directory); neither
            # can be read as an input table.
            if not os.path.isfile(inFile) or fileName.endswith('.df.out'):
                continue
            inOutFiles.append((inFile, os.path.join(outDir, fileName)))
    Identify(args.modelDir, inOutFiles)