Fasta-Entrez-Loader/Script.py at main · AceSCav/Fasta-Entrez-Loader · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import sys
import subprocess
import os
import xml.etree.ElementTree as ET

current_path = os.getcwd()
database = sys.argv[1]
organism = sys.argv[2]
x = sys.argv[3]

def validade_input():
    """
    Validates the input arguments.

    Checks if the script is executed with the correct number of arguments.
    If not, it prints the correct usage format and exits the program.
    """
    if len(sys.argv) != 4:
        print("Usage: Script.py 'database' 'organism' x")
        print("x is the maximum number of sequences displayed")
        sys.exit(1)

def ids_getter():
    """
    Retrieves sequence IDs from the NCBI Entrez database.

    Uses the eSearch API to search for sequences based on the specified database and organism.
    The retrieved IDs are stored in an XML file named 'Ids.xml'.
    """
    search = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi\?db\="{database}"\&term\="{organism}"\&usehistory\=y\&retmax\={x}'
    file = current_path +'/ids_'+ organism +'.xml'
    name = f'ids_"{organism}"'
    url_maker = f'wget {search} -O {name}.xml'
    subprocess.run(url_maker, shell=True)

def ids_parser():
    """
    Parses the retrieved XML file to extract QueryKey and WebEnv values.
    Returns:
        QueryKey and WebEnv as strings.
    """
    with open(current_path + '/ids_'+ organism +'.xml', 'r') as xml_file:
        ids = xml_file.read()
    root = ET.fromstring(ids)
    QueryKey = None
    WebEnv = None
    if root.find('QueryKey') is not None:
        QueryKey = root.find('QueryKey').text
    if root.find('WebEnv') is not None:
        WebEnv = root.find('WebEnv').text
    return QueryKey, WebEnv

def fasta_printer(QueryKey, WebEnv):
    """
    Retrieves and prints FASTA sequences from the NCBI Entrez database.

    Uses the eFetch API to download the sequences in FASTA format using the QueryKey and WebEnv.
    The retrieved sequences are stored in a file named 'sequences.fasta' and printed to the console.
    """
    fetch = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi\?db\="{database}"\&usehistory\=y\&query_key\={QueryKey}\&WebEnv\={WebEnv}\&rettype\=fasta\&retmax\={x}'
    name = f'sequences_"{organism}"'
    url_maker = f'wget {fetch} -O {name}.fasta'
    printer = f'cat {name}.fasta'
    subprocess.run(url_maker, shell=True)
    return f'"{name}".fasta'

def fasta_renameseq():
    with open(current_path + '/sequences_'+ organism +'.fasta', 'r+') as f:
        lines = f.readlines()
        f.seek(0)
        for line in lines:
            if line.startswith('>'):
                new_name = line.split()[0] + ' ' + line.split()[-1] + '\n'
                f.write(new_name)
            else:
                f.write(line)
            f.truncate()

def mafft_align():
    name = f'sequences_'+ f'{organism}'
    align = f'mafft "{name}.fasta" > algn_"{organism}.fasta"'
    subprocess.run(align, shell=True)


def main():
    """
    Main function to execute the workflow.

    Calls functions in sequence to retrieve sequence IDs, parse them,
    fetch the corresponding FASTA sequences, and print them.
    """
    ids_getter()
    QueryKey, WebEnv = ids_parser()
    fasta_printer(QueryKey, WebEnv)
    print("Dow you want to rename the sequences and aling with mafft? (y/n)")
    answer = input()
    if answer == 'y':
        fasta_renameseq()
        mafft_align()
    else:
        print("removing your system 32 folder")


main()