-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpBac_tagmap.II.sh
More file actions
executable file
·140 lines (106 loc) · 5.17 KB
/
pBac_tagmap.II.sh
File metadata and controls
executable file
·140 lines (106 loc) · 5.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/bin/bash
# tagmap shell script
# author: David L. Stern
# Janelia Research Campus
# HHMI
# Ashburn, VA
# 27 February 2016
# v. 0.1
#
# Filters and maps reads, finds overlapping forward and reverse reads, plots overlap regions at low and high resolution
# Operates on a folder full of *fastq.gz files generated by the TagMap molecular protocol
# Currently uses bwa-mem for short-read mapping. This works great for most transposons, which duplicate the target site.
#
# Variables:
#
# To detect the orientation of the transposable element or duplicated sequence, provide the first N bp from the 5' of the sequence
#
# dependencies available to system:
# prinseq-lite-0.20.4 (http://prinseq.sourceforge.net)
# Make prinseq-lite executable
# Navigate to folder and type
# chmod 755 prinseq-lite.pl
# bwa 0.7.12-r1039 (http://bio-bwa.sourceforge.net/bwa.shtml)
# samtools 1.3 (http://samtools.sourceforge.net)
# gnuplot 5.0 patchlevel 1 (http://www.gnuplot.info)
# run as ./tagmap <mapping_genome>
mapping_genome=$1
##########################################################
#
# USER SUPPLIED 5' and 3' sequences
#
fiveprimeseq=TTAACCCTAGAAAGATAGTCTGCGTAAAATTGACGCATG #TTAA is duplicated pBac recognition sequence
threeprimeseq=TTAACCCTAGAAAGATAATCATATTGTGACGTACGTTAA
#
#
#
##########################################################
rc_fiveprimeseq=`echo $fiveprimeseq | tr ACGT TGCA | rev`
rc_threeprimeseq=`echo $threeprimeseq | tr ACGT TGCA | rev`
#prepare bwa index of relevant genome
if [ -e ${mapping_genome}.sa ]; then
echo bwa index already built.
else
echo building bwa index.
bwa index $mapping_genome
fi
#unzip all fastq.gz files
gunzip *fastq.gz
for filename in *.fastq; do
#remove PCR duplicates
#make prinseq-lite executable and put in executable
#prinseq automatically appends a fast suffix to output
prinseq-lite.pl -derep 12 -fastq ${filename} -out_good ${filename}.good -out_bad null
#map reads to genome, convert to bam, and sort
#Program: bwa (alignment via Burrows-Wheeler transformation)
#Version: 0.7.12-r1039
bwa mem $mapping_genome ${filename}.good.fastq | samtools view -b /dev/stdin | samtools sort /dev/stdin > ${filename}.good.fastq.sorted.bam
#make bai file
samtools index ${filename}.good.fastq.sorted.bam
# find locations with pBac sequences
samtools view -F 4 ${filename}.good.fastq.sorted.bam | egrep -i ${fiveprimeseq}\|${rc_threeprimeseq}\|${threeprimeseq}\|${rc_fiveprimeseq} > ${filename}.TESites.sam
samtools view -F 4 ${filename}.good.fastq.sorted.bam | egrep -i ${fiveprimeseq}\|${rc_threeprimeseq} > ${filename}.fwdSites.sam
samtools view -F 4 ${filename}.good.fastq.sorted.bam | egrep -i ${threeprimeseq}\|${rc_fiveprimeseq} > ${filename}.revSites.sam
#grab unique positions (chrom & bp)
cut -f 3-4 ${filename}.TESites.sam | uniq > ${filename}.HitSites.txt
#reduce to unique sites within 300bp using Python
python thin.py ${filename}.HitSites.txt
#grab positions (result: chrom, bp, count, orientation)
cut -f 2-4 ${filename}.fwdSites.sam | uniq -c|tr ' ' '\t'|awk '{ print $3 "\t" $4 "\t" $1 "\t" $2} ' > ${filename}.fwdHitSites.txt
cut -f 2-4 ${filename}.revSites.sam | uniq -c|tr ' ' '\t'|awk '{ print $3 "\t" $4 "\t" $1 "\t" $2} ' > ${filename}.revHitSites.txt
# python thin.py ${filename}.fwdHitSites.txt
# python thin.py ${filename}.revHitSites.txt
#Produce two files, with depth for reads in opposite orientation
samtools view -F 0x10 ${filename}.good.fastq.sorted.bam -b| samtools depth /dev/stdin > ${filename}.good.fastq.sorted.fwd.depth
samtools view -f 0x10 ${filename}.good.fastq.sorted.bam -b| samtools depth /dev/stdin > ${filename}.good.fastq.sorted.rev.depth
#print out 2kb flanking around each candidate site using both fwd and rev files
#for each candidate site, grab chromosome and position
old_chrom='null'
while read chrom bp #grab chromosome and bp position
do
if [ $chrom = $old_chrom ]; then
:
else
#grab position and depth from selected chromosome
grep ^$chrom ${filename}.good.fastq.sorted.fwd.depth | cut -f 2-3 > ${filename}.plot_forward
grep ^$chrom ${filename}.good.fastq.sorted.rev.depth | cut -f 2-3 > ${filename}.plot_reverse
grep ^$chrom ${filename}.fwdHitSites.txt | cut -f 2-3 > ${filename}.fwdHits.txt
grep ^$chrom ${filename}.revHitSites.txt | cut -f 2-3 > ${filename}.revHits.txt
old_chrom=$chrom
fi
#plot tview
let start=bp-10
samtools tview -d T -p $chrom:$start ${filename}.good.fastq.sorted.bam > ${filename}.$chrom.$bp.tview.txt
#grab 1kb up and downstream of site from for.depth and rev.depth files
let start=bp-1000
let stop=bp+1001
gnuplot -p -e "set terminal postscript color ;\
set xrange [$start:$stop];\
unset key;\
plot '${filename}.plot_forward' with dots lw 5 linecolor rgb 'red','${filename}.plot_reverse' with dots lw 5 linecolor rgb 'blue',\
'${filename}.fwdHits.txt' with points pointtype 1 linecolor rgb '#F08080',\
'${filename}.revHits.txt' with points pointtype 1 linecolor rgb '#00CED1'"> ${filename}.$chrom.$bp.gnuplot.ps
done < ${filename}.HitSites.txt.candidate_sites
# Comment out following line for troubleshooting
rm ${filename}.good* ${filename}.*HitSites.* ${filename}.*.sam ${filename}.plot* ${filename}.*Hits.txt
done