-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path01-prepare_data.Rmd
More file actions
238 lines (214 loc) · 7.45 KB
/
01-prepare_data.Rmd
File metadata and controls
238 lines (214 loc) · 7.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
# Prepare data
Fetch the reads and regulatory regions for accession
[GSE85337](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE85337).
## Import reads
```{r Import Human 3 untreated dataset, cache = TRUE}
suppressPackageStartupMessages({
    library(Biobase)     # cache
    library(rtracklayer) # import
})
## Human 3 untreated: download the stranded bigWig coverage files for
## GEO sample GSM2265100 and merge them into one stranded GRanges.
strand <- c("plus", "minus")
acc <- "GSM2265100"
files <- paste0(acc, "_H3-U_", strand, ".bw")
urls <- paste0("https://www.ncbi.nlm.nih.gov/geo/download/?acc=", acc,
               "&format=file&file=", files)
names(urls) <- files
## Compare the number of URLs to the number of files with length(), not
## lengths(): lengths() is elementwise (always 1 for a character vector),
## which made the previous check vacuously true.
stopifnot(length(urls) == length(files))
## Download files only if not already present locally.
for (file in files)
    if (! file.exists(file))
        download.file(urls[file], file)
## Read in the bigWig stranded files (one GRanges per file).
reads_h3u <- sapply(files, import)
## Assign strand: first file is plus, second is minus.
reads_h3u <- mendoapply(`strand<-`, reads_h3u, value = c("+", "-"))
## Merge files into a single GRanges.
reads_h3u <- as(reads_h3u, "GRangesList")
reads_h3u <- unlist(reads_h3u)
names(reads_h3u) <- NULL
## Minus-strand bigWig scores are negative; use magnitudes throughout.
score(reads_h3u) <- abs(score(reads_h3u))
## Ensure scores were assigned to single bases: each range must span
## exactly one base, i.e. width == 1.  (width(), not lengths(), is the
## per-range span of a GRanges.)
stopifnot(all(width(reads_h3u) == 1))
## Assign genome.
cache(hg19 <- Seqinfo(genome = "hg19"))
seqinfo(reads_h3u) <- hg19[seqlevels(reads_h3u)]
## Remove chrM.
reads_h3u <- dropSeqlevels(reads_h3u, "chrM", pruning.mode = "coarse")
## Save result.
reads_h3u
save(reads_h3u, file = "reads_h3u.RData")
```
## Generate TREs from dREG calls
Ordinarily, one can use the dREG regions as the transcription
regulatory elements (TREs) directly.
However, the dREG calls in this dataset are slightly unusual; they are
single bases instead of ranges to allow for CrossMap conversions
across species. We have to run a few extra steps mentioned [in the
paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5957490/#S17title) to
obtain TRE ranges from the single base calls.
```{r Generate Human untreated TRE calls from dREG, cache = TRUE}
## Build TRE ranges from the single-base dREG calls published with this
## accession (see the linked paper methods).  Network access to GEO is
## required on the first run only.
suppressPackageStartupMessages({
library(Biobase) # cache
library(rtracklayer) # import
})
acc <- "GSE85337"
file <- paste0(acc, "_H-U.TSS.bw")
url <- paste0("https://www.ncbi.nlm.nih.gov/geo/download/?acc=", acc,
"&format=file&file=", file)
if (! file.exists(file))
download.file(url, file)
dreg <- import(file)
## Assign genome.
cache(hg19 <- Seqinfo(genome = "hg19"))
seqinfo(dreg) <- hg19[seqlevels(dreg)]
## Remove chrM.
dreg <- dropSeqlevels(dreg, "chrM", pruning.mode = "coarse")
## Subset TREs using 0.3 threshold per online methods of
## doi:10.1038/s41559-017-0447-5
dreg <- dreg[score(dreg) > 0.3]
## Merge sites within 500 bp per online methods.
## NOTE(review): distanceToNearest() pairs each site only with its single
## nearest neighbor, so chains of >2 nearby sites are merged pairwise and
## overlapping spans are collapsed by reduce() below — confirm this
## matches the paper's merging procedure.
hits <- distanceToNearest(dreg)
hits <- hits[mcols(hits)$distance <= 500]
## Two-row matrix of paired start coordinates: row 1 = "from" starts,
## row 2 = "to" starts; one column per hit.  (The calls are single
## bases, so start == end.)
ranges <- as(start(List(ranges(dreg[from(hits)]),
ranges(dreg[to(hits)]))), "matrix")
## Span each pair from the smaller to the larger start coordinate.
tre_merged <- GRanges(ranges =
IRanges(start = apply(ranges, 2, min),
end = apply(ranges, 2, max)),
seqnames = seqnames(dreg[from(hits)]))
tre_merged <- reduce(tre_merged) # Remove duplicates.
## Add remaining sites with no neighbors within 500 bp.
## NOTE(review): if hits were empty, `- from(hits)` would subset with an
## empty index and drop everything instead of keeping everything —
## harmless for this dataset, but worth a guard if this code is reused.
dreg_far <- dreg[- from(hits)]
score(dreg_far) <- NULL
tre <- sort(c(tre_merged, dreg_far))
## Save result.
tre
save(tre, file = "tre.RData")
```
## Import mappable file
The unique-mappability bedGraph file `hg19_30mers_unique.bedgraph` was
generated on another computer and has been uploaded to Zenodo:
[doi:10.5281/zenodo.3236288](https://doi.org/10.5281/zenodo.3236288).
```{r Import precomputed hg19 30-mer mappable file, cache = TRUE}
suppressPackageStartupMessages({
    library(Biobase)     # cache
    library(rtracklayer) # import
})
url <- "https://zenodo.org/record/3236288/files/hg19_30mers_unique.bedgraph.gz"
file <- basename(url)
## Download only if not already present locally.
if (! file.exists(file))
    download.file(url, file)
hg19_mappable_all <- import(file)
## Define hg19 here like the other chunks do, instead of silently relying
## on an earlier chunk's session state; cache() makes this cheap on
## re-knit and keeps the chunk runnable standalone.
cache(hg19 <- Seqinfo(genome = "hg19"))
seqinfo(hg19_mappable_all) <- hg19[seqlevels(hg19_mappable_all)]
## Save result.
hg19_mappable_all
save(hg19_mappable_all, file = "hg19_mappable_all.RData")
## Split by chromosome.
grl <- split(hg19_mappable_all, seqnames(hg19_mappable_all))
## Sum mappable bases by chromosome.
hg19_mappable_with_chrM <- sum(width(grl))
## Remove chrM.
chromosomes_no_chrM <- setdiff(names(hg19_mappable_with_chrM), "chrM")
hg19_mappable <- hg19_mappable_with_chrM[chromosomes_no_chrM]
## Save result.
hg19_mappable
save(hg19_mappable, file = "hg19_mappable.RData")
```
## Import timecourse reads
```{r Import timecourse reads, cache = TRUE}
suppressPackageStartupMessages({
    library(GenomicAlignments)
    library(TxDb.Hsapiens.UCSC.hg19.knownGene)
    library(org.Hs.eg.db)
})
## Example chr7 reads shipped with groHMM.  Use a proper regex anchored
## at the end of the filename; the previous "*R1.bam" was a shell glob
## passed where list.files() expects a regular expression.
bams_chr7_unsorted <-
    list.files(system.file(package = "groHMM", "extdata"),
               "R1\\.bam$",
               full.names = TRUE)
## fixed = TRUE: ".bam" is a literal suffix, not a regex ("." would
## otherwise match any character).
bams_chr7 <- sub(".bam", "-sorted.bam", bams_chr7_unsorted, fixed = TRUE)
## groHMM doesn't come with a BAM index, so sort a copy and index it.
mapply(sortBam,
       bams_chr7_unsorted,
       tools::file_path_sans_ext(bams_chr7))
indexBam(bams_chr7)
## Read the small BAMs into memory for polymeraseWave, groHMM, etc.
reads_chr7 <-
    as(
        lapply(bams_chr7,
               function(file)
                   sort(
                       as(readGAlignments(file),
                          "GRanges"))),
        "List")
names(reads_chr7) <- c(0, 40) # Timepoint in minutes.
metadata(reads_chr7) <- list(files = setNames(bams_chr7, names(reads_chr7)))
## Split a GRanges into a GRangesList keyed by the strand levels that
## actually occur (unused factor levels are dropped first).
split_by_strand <- function(gr) {
    observed_strands <- droplevels(strand(gr))
    as(split(gr, observed_strands), "GRangesList")
}
## Collapse single-stranded reads into per-base coverage ranges,
## carrying the reads' common strand over to the result.
gr_coverage <- function(reads) {
    common_strand <- runValue(strand(reads))
    stopifnot(length(common_strand) == 1)  # all reads must share a strand
    result <- as(coverage(reads), "GRanges")
    strand(result) <- common_strand
    result
}
## Per-timepoint coverage histograms: one GRanges per timepoint carrying
## per-base read-depth scores, with zero-coverage runs removed.
## (Removed a leftover interactive browser() debugging call that halted
## non-interactive rendering of this document.)
hist_chr7 <-
    as(sapply(reads_chr7,
              function (gr) {
                  grl <- split_by_strand(gr)
                  hist <- unlist(as(lapply(grl, gr_coverage),
                                    "GRangesList"),
                                 use.names = FALSE)
                  seqinfo(hist) <- seqinfo(gr)
                  ## Keep only covered bases.
                  hist[score(hist) > 0]
              }),
       "GRangesList")
## Create gene list restricted to standard chromosomes.
genes <- keepStandardChromosomes(
    setNames(
        sort(
            genes(TxDb.Hsapiens.UCSC.hg19.knownGene)),
        NULL),
    pruning.mode = "coarse")
## Add column of the common gene symbol.  Namespace-qualify select() to
## guard against masking (e.g. by dplyr) if this document grows.
mcols(genes)$symbol <- AnnotationDbi::select(org.Hs.eg.db,
                                             keys = genes$gene_id,
                                             keytype = "ENTREZID",
                                             columns = "SYMBOL")$SYMBOL
genes
## Keep only genes overlapping the region covered by the reads.  Named
## so it does not shadow base::range().
reads_range <- range(unlist(reads_chr7))
genes <- subsetByOverlaps(genes, reads_range)
genes
## Find a long gene with lots of counts.
##
## Add counts and densities per timepoint.  NOTE(review): "rpkm_" here is
## counts / width (reads per base), not true RPKM — there is no
## per-million scaling; the ranking below is unaffected.
for (time in names(reads_chr7)) {
    mcols(genes)[paste0("counts_", time)] <-
        countOverlaps(genes, reads_chr7[[time]])
    mcols(genes)[paste0("rpkm_", time)] <-
        mcols(genes)[paste0("counts_", time)][[1]] / width(genes)
}
## Ratio of 40 min to 0 min counts (gene widths cancel, so this equals
## the density ratio).
mcols(genes)["rpkm_ratio"] <-
    mcols(genes)[["counts_40"]] /
    mcols(genes)[["counts_0"]]
## Filter to moderately long, expressed genes that gained signal.
df <- as.data.frame(genes)
df <- subset(df,
             width > 5000 &
                 width < 40000 &
                 counts_0 > 0 &
                 counts_40 > 0 &
                 counts_40 > counts_0)
## Sort by density at 40 min, descending.
df <- df[with(df, order(-rpkm_40)), ]
head(df[, -c(1:3, 6, 9)], 100)
gene <- genes[genes$symbol %in% "ATP6V1FNB"]
## Save results.
save(genes, file = "genes.RData")
save(gene, file = "gene.RData")
save(reads_chr7, file = "reads_chr7.RData")
save(hist_chr7, file = "hist_chr7.RData")
```