Zuzu/simseq_test.R at main · BenjaminATaylor/Zuzu · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
library(SimSeq)
data(kidney)
counts <- kidney$counts # Matrix of read counts from KIRC dataset
replic <- kidney$replic # Replic vector indicating paired columns
treatment <- kidney$treatment # Treatment vector indicating Non-Tumor or Tumor columns
nf <- apply(counts, 2, quantile, 0.75)
library(fdrtool)
## Not run:
### Example 1: Simulate Matrix with 1000 DE genes and 4000 EE genes
data.sim <- SimData(counts = counts, replic = replic, treatment = treatment,
                    sort.method = "paired",
                    k.ind = 5,  # = samples per treatment group
                    n.genes = 5000, n.diff = 1000,
                    norm.factors = nf)


#############
# Note that since we haven't specified a weights vector, the simulation will try to weight choice of DE genes based on the degree of differentiation in the real data (as I understand it)

data.simseq = SimData(counts = counts.true,
                      #samp.independent = TRUE,
                      treatment = metadata.true$phenotype,
                      sort.method = "unpaired",
                      switch.trt = FALSE,
                      k.ind = floor(min(table(metadata.true$phenotype))/2),  # = samples per treatment group
                      n.genes = nrow(counts.true),
                      n.diff = floor(nrow(counts.true)*0.1))


# An annoying feature of this simulation is that the maximum sample size that it permits in the synthetic data is half the size of the reference group in the real data, which means e.g. for a 60-sample dataset we can only get a 30-sample simulation. This is a bit of a pain, but I think we can work around it by just running the simulation multiple times and concatenating the results?

############


### Example 2: Calculate weights vector beforehand to save run time in repeated simulations
sort.list <- SortData(counts = counts, treatment = treatment, replic = replic,
                      sort.method = "paired", norm.factors = nf)
counts <- sort.list$counts # Looks like this just sorts, no trimming that I can see despite the documentation
replic <- sort.list$replic
treatment <- sort.list$treatment
nf <- sort.list$norm.factors
probs <- CalcPvalWilcox(counts, treatment, sort.method = "paired",
                        sorted = TRUE, norm.factors = nf, exact = FALSE)
weights <- 1 - fdrtool(probs, statistic = "pvalue", plot = FALSE, verbose = FALSE)$lfdr
# Okay so it looks to me like the weights here are inverse pvals, such that genes that are more DE in the original data are more likely to be selected as DE in the simulation
data.sim.2 <- SimData(counts = counts, replic = replic, treatment = treatment,
                    sort.method = "paired", k.ind = 5, n.genes = 5000, n.diff = 1000,
                    weights = weights, norm.factors = nf)

### Example 3: Specify which genes you want to use in the simulation
# Randomly sample genes or feed in the exact genes you wish to use
genes.diff <- sample(1:nrow(counts), size = 1000, prob = weights)
genes <- c(sample(1:nrow(counts)[-genes.diff], 4000), genes.diff)
data.sim <- SimData(counts = counts, replic = replic, treatment = treatment,
                    sort.method = "paired", k.ind = 5, genes.select = genes,
                    genes.diff = genes.diff, weights = weights, norm.factors = nf)
### Example 4: Simulate matrix with DE genes having log base 2 fold change greater than 1
# add one to counts matrix to avoid infinities when taking logs
tumor.mean <- rowMeans(log2((counts[, treatment == "Tumor"] + 1) %*%
                              diag(1/nf[treatment == "Tumor"])))
nontumor.mean <- rowMeans(log2((counts[, treatment == "Non-Tumor"] + 1) %*%
                                 diag(1/nf[treatment == "Non-Tumor"])))

lfc <- tumor.mean - nontumor.mean
weights.zero <- abs(lfc) < 1
weights[weights.zero] <- 0
data.sim <- SimData(counts = counts, replic = replic, treatment = treatment,
                    sort.method = "paired", k.ind = 5, n.genes = 5000, n.diff = 1000,
                    weights = weights, norm.factors = nf)
### Example 5: Simulate three treatment groups:
### 3 Different types of Differential Expression Allowed
### First Group Diff, Second and Third group Equal
### Second Group Diff, First and Third group Equal
### Third Group Diff, First and Second group Equal
k <- 5 # Sample Size in Each treatment group
### Sample DE genes beforehand
N <- nrow(counts)
genes.de <- sample(1:N, size = 1000, prob = weights) # Sample all DE genes
DE1 <- genes.de[1:333] # Sample DE genes with first trt diff
DE2 <- genes.de[334:666] # Sample DE genes with sec trt diff
DE3 <- genes.de[667:1000] # Sample DE genes with third trt diff
EE <- sample( (1:N)[-genes.de], size = 4000) #Sample EE genes

genes.tot <- c(EE, genes.de)
genes.de1 <- union(DE2, EE) #Assign DE genes for first sim
genes.de2 <- union(DE2, DE3) #Assign DE genes for second sim
data.sim1 <- SimData(counts = counts, replic = replic, treatment = treatment,
                     sort.method = "paired", k.ind = k, genes.select = genes.tot,
                     genes.diff = genes.de1, weights = weights, norm.factors = nf)
#remove pairs of columns used in first simulation
cols.rm <- c(data.sim1$col[1:(2*k)], data.sim1$col[1:(2*k)] + 1)
counts.new <- counts[, -cols.rm]
nf.new <- nf[-cols.rm]
replic.new <- replic[-cols.rm]
treatment.new <- treatment[-cols.rm]
### Set switch.trt = TRUE for second sim
data.sim2 <- SimData(counts = counts.new, replic = replic.new, treatment = treatment.new,
                     sort.method = "paired", k.ind = k, genes.select = genes.tot,
                     genes.diff = genes.de2, weights = weights, norm.factors = nf.new,
                     switch.trt = TRUE)
### Remove first k.ind entries from first sim and combine two count matrices
counts.sim <- cbind(data.sim1$counts[, -(1:k)], data.sim2$counts)
### treatment group levels for simulated matrix
trt.grp <- rep(NA, 5000)
trt.grp[is.element(data.sim1$genes.subset, DE1)] <- "DE_First_Trt"
trt.grp[is.element(data.sim1$genes.subset, DE2)] <- "DE_Second_Trt"
trt.grp[is.element(data.sim1$genes.subset, DE3)] <- "DE_Third_Trt"
trt.grp[is.element(data.sim1$genes.subset, EE)] <- "EE"