Automation_scripts/annot.py at main · elichter/Automation_scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import matplotlib.pyplot as plt
import seaborn as sns
from statannot import add_stat_annotation
import pandas as pd
from scipy.stats import bartlett
import scipy.stats as stats
from statannotations.Annotator import Annotator
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import numpy as np
from scikit_posthocs import posthoc_tukey
from io import StringIO
sns.set(style="whitegrid")
df = pd.read_excel('df31.xlsx')
df2=df

# Add a column 'count' to the dataframe, and start it at 0.
df2['count'] =0

# Group by 'Strain', 'Name', and 'Type' and aggregate as count. This will add
# the counts into the 'count' column.
#df2 = df.groupby(['Strain','Name','Type']).count().reset_index()
df2 = df.groupby(['Strain','Name']).count().reset_index()
df21 = df.groupby(['Strain','Name', 'Type']).count().reset_index()
#df3 = df2.groupby(['Strain'])['count'].apply(list)
#AR = f_oneway(*df3)
#df3 = pd.DataFrame(df3).transpose().reset_index()
#Strain = np.unique(df2.Strain)

# Get all unique strains from the df2 into a variable called 'Strain'. Then,
# creat an empty dataframe 'df3' and for each unique value in strain, append
# the df2 values from 'count'. Finally, perform f_oneway statistics on df3.
Strain = df2.Strain.unique()
df3 = []
for s in Strain:
    df3.append(df2[df2['Strain'] == s]['count'])
F = f_oneway(*df3)

# Cast the f_oneway statistics into a string so that it can then be saved as a
# dataframe. Use the StringIO to implement a file-like class on the string
# ('F') so that it can be read in as a CSV and hence become a dataframe.
F = str(F)
F = StringIO(F)
F = pd.read_csv(F, sep=";")

# Perform groupwise comparisons using tukey HSD
tukey = pairwise_tukeyhsd(endog=df2['count'],
                          groups=df2['Strain'],
                          alpha=0.05)

# Save the tukey data as a dataframe and append the F stats from F dataframe
# into it. Finally, export the dataframe to excel as 'Syn_stats.xlsx'.
tukey = pd.DataFrame(data=tukey._results_table.data[1:],
columns=tukey._results_table.data[0])
tukey = tukey.append(F)
tukey.to_excel('Syn_stats.xlsx',index=False)

# In order to annotate the resulting graphs with stars for statistical
# significance, the tukey results need to be put into a simple matrix (just
# showing the p values, not other parameters such as meandiff, lower and upper
# etc).
tukey_df = posthoc_tukey(df2, val_col="count",
group_col="Strain")

# The matrix needs to be converted to a non-redundant list of comparisons with
# the p-value. This is done by removing the lower half and diagonal of the
# matrix and turning the matrix format into a long dataframe using melt(). The
# code and resulting dataframe are shown below.
remove = np.tril(np.ones(tukey_df.shape), k=0).astype("bool")
tukey_df[remove] = np.nan
molten_df = tukey_df.melt(ignore_index=False).reset_index().dropna()

# x, y, and order are defined so that they can be used in the graphs below.
x = "Strain"
y = 'count'
order = ['WT', 'Polg', 'PKO', 'W402A']
sns.set_style("whitegrid", {'axes.grid' : False})
fig, axes = plt.subplots(1, 2,figsize=(17, 4.5))
fig.suptitle('Synaptosomes', weight='bold')
ax = sns.barplot(ax=axes[0],data=df2,x=x, y=y, order=order,
        facecolor=(1,1,1,0),edgecolor='.2')
ax.set_title('All Mutations')

# Add a swarmplot to visualize the individual datapoints on the barplot. Color
# it black so that the points are easy to spot.
ax = sns.swarmplot(ax=axes[0],data=df2,x=x, y=y, order=order,color='.2')
ax.set_ylabel('Number of Mutations')
ax.set_ylim(top=max(df2['count']) + 10)

# In order to only annotate the graph where there are significant differences,
# the dataframe 'molten_df', which contains the p values, will be filtered so
# that only significant p values (<= 0.05) are in there. Note, if all notations
# are desireable, skip the filtering step.
molten_df = molten_df.loc[molten_df['value'] <= 0.05]

# The pairs for multiple comparisons is defined as all strains in the p value
# table.
pairs = [(i[1]["index"], i[1]["variable"]) for i in molten_df.iterrows()]

# A list of p values is generated from the molten_df dataframe. The annotator
# is then defnied and configured to annotate the graph with stars using the p
# values from the list.
p_values = [i[1]["value"] for i  in molten_df.iterrows()]
annotator = Annotator(ax, pairs, data=df2, x=x, y=y, order=order)
annotator.configure(text_format="star", loc="inside")
annotator.set_pvalues_and_annotate(p_values)


#print(type(p_values))
#annotator = Annotator(ax, pairs, data=df2, x=x, y=y, order=order)
#annotator.configure(text_format="star", loc="inside")
#annotator.set_pvalues_and_annotate(p_values)


# reshape the d dataframe suitable for statsmodels package
#df2 = pd.melt(df2.reset_index(), id_vars=['Strain'], value_vars=['count'])
#fvalue, pvalue = stats.f_oneway(df2['Strain'], df2['count'])
# Make a 1 * 2 plot
#f_val, p_val = ss.f_oneway(df2['count'],df2['Strain'])
ax2 = sns.barplot(ax=axes[1],data=df21,x=x, y=y, order=order, hue='Type')
ax2.set_title('Mutations by Type')
ax2.set_ylabel('Number of Mutations')
sns.move_legend(ax2,'upper left',bbox_to_anchor=(1, 1.025))
#annotator = Annotator(ax, pairs, data=df, x=x, y=y, order=order)
#annotator.configure(test='Kruskal', text_format='star', loc='outside')
#annotator.apply_and_annotate()
#add_stat_annotation(ax, data=df, x=x, y=y, order=order, box_pairs=[("WT",
#"Polg"), ("WT", "PKO"), ("WT","W402A")],test='t-test_ind',
#text_format='star', loc='outside',verbose=2)
#stats.bartlett('WT', 'Polg', 'PKO', 'W402A')
#stat, p = bartlett('WT', 'Polg', 'PKO', 'W402A')
plt.savefig('anot.png')

# auto set ylim as y max +.5

# see if there is a way to add p value legend (of only the values used) on the
# graph - verbose controls how much detail is printed, default seems to be 2.
# So somehow caputring the verbose is probably what is needed.