# python functionality workshop.txt
# Python functionality workshop: reading multi-FASTA files (Questions 1-5).
# Question 1
## Write a programme (or script) that will produce a list of sequence headers from a multi-fasta file.
# Function that extracts the sequence headers from a multi-FASTA file
def extract_headers(fasta_file):
    """Return the sequence headers found in a multi-FASTA file.

    A header is any line whose first non-whitespace character is '>'.
    Surrounding whitespace and exactly one leading '>' are removed, which is
    the same rule the dictionary-building code in this file applies, so the
    returned headers match that dictionary's keys.

    Parameters:
        fasta_file (str): Path to the FASTA file.

    Returns:
        list: Header strings in file order, without the leading '>'.
    """
    headers = []
    with open(fasta_file, 'r') as file:
        for line in file:
            # Strip before testing so an indented header line is still found
            line = line.strip()
            if line.startswith('>'):
                # Slice instead of lstrip('>'): lstrip would swallow every
                # leading '>' and alter headers that begin with that character
                headers.append(line[1:])
    return headers
# Location of the multi-FASTA file used throughout the workshop
fasta_file = "Downloads/workshop_data.fasta"
# Collect every header in the file into a list
headers = extract_headers(fasta_file)
# Show the resulting list of headers
print(headers)
# Question 2
## Using the header list you created in question 1, number the sequence headers alphabetically.
# Walk through the headers in alphabetical order.
# enumerate() supplies the running number, so no manual counter is needed;
# start=1 makes the first header number 1.
for count, items in enumerate(sorted(headers), start=1):
    # Print the number followed by the header it belongs to
    print(count, items)
# Question 3
# Build a header -> sequence dictionary straight from the FASTA file
seq_dict = {}
with open("Downloads/workshop_data.fasta", "r") as fastafile:
    # Header of the record being read; None until the first '>' line appears
    header = None
    # Sequence lines collected so far for that record
    sequence = []
    for line in fastafile:
        # Drop the newline and any surrounding whitespace
        line = line.strip()
        # Any non-header line is one more piece of the current sequence
        if not line.startswith(">"):
            sequence.append(line)
            continue
        # A new header closes the previous record, so store that one first
        if header is not None:
            seq_dict[header] = ''.join(sequence)
        # Start the next record: drop the '>' and clear the collected lines
        header, sequence = line[1:], []
    # The final record has no following header, so store it here
    if header is not None:
        seq_dict[header] = ''.join(sequence)
# seq_dict now maps each header to its full sequence; print it to check
print(seq_dict)
# Question 4
## Re-write your code from question 3, so that it takes the form of a function.
# Function that turns a FASTA file into a header -> sequence dictionary
def fasta_to_dict(fasta_file):
    """
    Read a FASTA file into a dictionary keyed by header.

    Each header is stored without its leading '>' and with surrounding
    whitespace removed. The lines that follow a header, up to the next
    header, are stripped and joined into one string. Lines that appear
    before the first header are ignored, and a repeated header keeps only
    the sequence of its last occurrence.

    Function Parameters:
        fasta_file (str): Path to the FASTA file.
    Returns:
        dict: Headers as keys and their joined sequences as values.
    """
    # Lines collected per header; joined into strings once reading is done
    pieces = {}
    # Header of the record being read; None until the first '>' line
    current = None
    with open(fasta_file, "r") as handle:
        for raw in handle:
            text = raw.strip()
            if text.startswith(">"):
                # A header opens a fresh record (replacing any earlier one
                # that carried the same header)
                current = text[1:]
                pieces[current] = []
            elif current is not None:
                # Sequence text belongs to the most recent header
                pieces[current].append(text)
    # Collapse each list of lines into a single sequence string
    return {name: ''.join(parts) for name, parts in pieces.items()}
# Try the function on the workshop data and keep the result for later use
fasta_dict = fasta_to_dict("Downloads/workshop_data.fasta")
# Display the dictionary to confirm the headers and sequences were read
print(fasta_dict)
# Question 5
## How would you access individual elements of a dictionary created through the function you wrote in question 4?
# "fasta_dict" is the dictionary built by the FASTA-to-dictionary function
# Method 1
# Loop over the keys and look each value up by its key
# (iterating over fasta_dict directly would visit the same keys)
for elements in fasta_dict.keys():
    # Index with the loop variable itself so every key is printed with its
    # own sequence
    print(elements, fasta_dict[elements])
# Method 2
# dict.items() gives a view of (key, value) tuples; print the whole view
print(fasta_dict.items())
# Method 3
# Loop over dict.items() and unpack each pair into a key and a value
for k, v in fasta_dict.items():
    # Print the key and its value separated by an equals sign
    print(k, "=", v)