TagMap/pBac_tagmap.II.sh at master · dstern/TagMap · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/bin/bash

# tagmap shell script
# author: 	David L. Stern
# 			Janelia Research Campus
#			HHMI
#			Ashburn, VA
#			27 February 2016
#			v. 0.1
#
# Filters and maps reads, finds overlapping forward and reverse reads, plots overlap regions at low and high resolution
# Operates on a folder full of *fastq.gz files generated by the TagMap molecular protocol
# Currently uses bwa-mem for short-read mapping. This works great for most transposons, which duplicate the target site.
#
# Variables:
#
# To detect the orientation of the transposable element or duplicated sequence, provide the first N bp from the 5' of the sequence
#
# dependencies available to system:
#	prinseq-lite-0.20.4 (http://prinseq.sourceforge.net)
#		Make prinseq-lite executable
#			Navigate to folder and type
#			chmod 755 prinseq-lite.pl
#	bwa 0.7.12-r1039 (http://bio-bwa.sourceforge.net/bwa.shtml)
#	samtools 1.3 (http://samtools.sourceforge.net)
#	gnuplot 5.0 patchlevel 1 (http://www.gnuplot.info)

# run as ./tagmap <mapping_genome>

mapping_genome=$1

##########################################################
#
# USER SUPPLIED 5' and 3' sequences
#

fiveprimeseq=TTAACCCTAGAAAGATAGTCTGCGTAAAATTGACGCATG #TTAA is duplicated pBac recognition sequence
threeprimeseq=TTAACCCTAGAAAGATAATCATATTGTGACGTACGTTAA

#
#
#
##########################################################


rc_fiveprimeseq=`echo $fiveprimeseq | tr ACGT TGCA | rev`
rc_threeprimeseq=`echo $threeprimeseq | tr ACGT TGCA | rev`


#prepare bwa index of relevant genome

if [ -e ${mapping_genome}.sa ]; then
	echo bwa index already built.
else
	echo building bwa index.
	bwa index $mapping_genome
fi

#unzip all fastq.gz files

gunzip *fastq.gz

for filename in *.fastq; do

	#remove PCR duplicates
	#make prinseq-lite executable and put in executable
	#prinseq automatically appends a fast suffix to output
	prinseq-lite.pl -derep 12 -fastq ${filename} -out_good ${filename}.good -out_bad null

	#map reads to genome, convert to bam, and sort
	#Program: bwa (alignment via Burrows-Wheeler transformation)
	#Version: 0.7.12-r1039
	bwa mem $mapping_genome ${filename}.good.fastq | samtools view -b /dev/stdin | samtools sort /dev/stdin > ${filename}.good.fastq.sorted.bam

	#make bai file

	samtools index ${filename}.good.fastq.sorted.bam

	# find locations with pBac sequences
	samtools view -F 4 ${filename}.good.fastq.sorted.bam | egrep -i ${fiveprimeseq}\|${rc_threeprimeseq}\|${threeprimeseq}\|${rc_fiveprimeseq} > ${filename}.TESites.sam

	samtools view -F 4 ${filename}.good.fastq.sorted.bam | egrep -i ${fiveprimeseq}\|${rc_threeprimeseq} > ${filename}.fwdSites.sam
	samtools view -F 4 ${filename}.good.fastq.sorted.bam | egrep -i ${threeprimeseq}\|${rc_fiveprimeseq}  > ${filename}.revSites.sam

	#grab unique positions (chrom & bp)
	cut -f 3-4 ${filename}.TESites.sam | uniq > ${filename}.HitSites.txt

	#reduce to unique sites within 300bp using Python
	python thin.py ${filename}.HitSites.txt

	#grab positions (result: chrom, bp, count, orientation)
	cut -f 2-4 ${filename}.fwdSites.sam | uniq -c|tr ' ' '\t'|awk '{ print $3 "\t" $4 "\t" $1 "\t" $2} ' > ${filename}.fwdHitSites.txt
	cut -f 2-4 ${filename}.revSites.sam | uniq -c|tr ' ' '\t'|awk '{ print $3 "\t" $4 "\t" $1 "\t" $2} ' > ${filename}.revHitSites.txt


#	python thin.py ${filename}.fwdHitSites.txt
#	python thin.py ${filename}.revHitSites.txt

	#Produce two files, with depth for reads in opposite orientation
	samtools view -F 0x10 ${filename}.good.fastq.sorted.bam -b| samtools depth /dev/stdin > ${filename}.good.fastq.sorted.fwd.depth
	samtools view -f 0x10 ${filename}.good.fastq.sorted.bam -b| samtools depth /dev/stdin  > ${filename}.good.fastq.sorted.rev.depth

	#print out 2kb flanking around each candidate site using both fwd and rev files

	#for each candidate site, grab chromosome and position
	old_chrom='null'
	while read chrom bp #grab chromosome and bp position
	do
		if [ $chrom = $old_chrom ]; then
			:
		else
			#grab position and depth from selected chromosome
			grep ^$chrom ${filename}.good.fastq.sorted.fwd.depth | cut -f 2-3 > ${filename}.plot_forward
			grep ^$chrom ${filename}.good.fastq.sorted.rev.depth | cut -f 2-3  > ${filename}.plot_reverse
			grep ^$chrom ${filename}.fwdHitSites.txt | cut -f 2-3  > ${filename}.fwdHits.txt
			grep ^$chrom ${filename}.revHitSites.txt | cut -f 2-3  > ${filename}.revHits.txt
			old_chrom=$chrom
		fi
		#plot tview
		let start=bp-10
		samtools tview -d T -p $chrom:$start ${filename}.good.fastq.sorted.bam > ${filename}.$chrom.$bp.tview.txt

		#grab 1kb up and downstream of site from for.depth and rev.depth files
		let start=bp-1000
		let stop=bp+1001

		gnuplot -p -e "set terminal postscript color ;\
		set xrange [$start:$stop];\
		unset key;\
		plot '${filename}.plot_forward' with dots lw 5 linecolor rgb 'red','${filename}.plot_reverse' with dots lw 5 linecolor rgb 'blue',\
		'${filename}.fwdHits.txt' with points pointtype 1 linecolor rgb '#F08080',\
		'${filename}.revHits.txt' with points pointtype 1 linecolor rgb '#00CED1'"> ${filename}.$chrom.$bp.gnuplot.ps

	done < ${filename}.HitSites.txt.candidate_sites

# Comment out following line for troubleshooting
	rm ${filename}.good* ${filename}.*HitSites.* ${filename}.*.sam ${filename}.plot* ${filename}.*Hits.txt

done