-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprep_normality.R
More file actions
84 lines (69 loc) · 2 KB
/
prep_normality.R
File metadata and controls
84 lines (69 loc) · 2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# Testing for normality
# reference: http://www.sthda.com/english/wiki/normality-test-in-r
#
# NOTE: the former `rm(list = ls())` was removed. It wipes the caller's
# workspace when this script is source()d and still does not give a clean
# session (attached packages and options survive). Run the script in a
# fresh R session instead.

# Install dependencies only when they are missing, instead of reinstalling
# them on every run.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
if (!requireNamespace("ggpubr", quietly = TRUE)) {
  if (!requireNamespace("devtools", quietly = TRUE)) {
    install.packages("devtools")
  }
  devtools::install_github("kassambara/ggpubr")
}
library("dplyr")
library("ggpubr")

# Load the data set; the file name is reused later to label the output folder.
datasrc <- "GA_GEN.csv"
my_data <- read.csv(datasrc)

# Use the Probname column as row labels, then drop the first column so that
# only the remaining variables are analysed.
# NOTE(review): dropping by position assumes Probname is the first column of
# the CSV - confirm against the data file.
row.names(my_data) <- my_data$Probname
my_data <- my_data[-1]
my_data
varnames <- colnames(my_data)
varnames

# display a random sample of (at most) 10 rows
set.seed(1234)
dplyr::sample_n(my_data, min(10, nrow(my_data)))

# The central limit theorem: no matter what distribution the data have,
# the sampling distribution of the mean tends to be normal if the sample
# is large enough (n > 30)
n <- nrow(my_data)
n

# creating output folders
mainDir <- "Output"
dir.create(mainDir, showWarnings = FALSE)
# checking normality
# visual
# QQ-plot - if all points fall approx. on reference line, we can
# assume normality
# Writes a normal QQ-plot of one column of `my_data` to a JPEG file.
#
# Arguments:
#   outDir   - sub-folder (below `baseDir`) that receives the image
#   filename - file name of the JPEG to create
#   my_data  - table holding the variables; column `j` is plotted
#   j        - index of the column to plot
#   baseDir  - parent output folder; defaults to the global `mainDir`, which
#              is what the function always used before
#
# Returns the value of dev.off(), exactly as before.
gen_qqplot <- function(outDir, filename, my_data, j, baseDir = mainDir) {
  jpeg(file.path(baseDir, outDir, filename))
  device_id <- dev.cur()
  closed <- FALSE
  # Close the JPEG device even when plotting fails, so that a broken column
  # does not leave a graphics device (and a half-written file) open.
  on.exit(if (!closed) dev.off(device_id), add = TRUE)
  qq <- ggqqplot(my_data[, j])
  # ggplot objects are only rendered when printed explicitly inside a function
  print(qq)
  closed <- TRUE
  dev.off(device_id)
}
#gen_qqplot(outDir, "yolo.jpg", my_data, 1)
# Each run writes into its own folder, named after the data file and the
# current time in seconds, so that earlier plots are not overwritten.
tstamp <- as.numeric(Sys.time(), units="secs")
outDir <- paste("QQPLOT", datasrc, toString(tstamp), sep = "_")
# recursive = TRUE also creates the parent folder when it does not exist yet
dir.create(file.path(mainDir, outDir), recursive = TRUE)
outDir
# One JPEG per column. seq_along() gives an empty loop for a table without
# columns, whereas 1:length() would iterate over c(1, 0).
for (j in seq_along(my_data)) {
  fname <- paste("qqplot", varnames[j], "jpg", sep = ".")
  gen_qqplot(outDir, fname, my_data, j)
}
# normality test
# Shapiro-Wilk test: if p-value > 0.05, the distribution of the data is
# not significantly different from a normal distribution - so we can
# assume normality

# Runs shapiro.test() on every column of `data` and prints each result below
# its label.
#
# Arguments:
#   data   - table whose columns are tested one by one
#   labels - names printed before each result, one per column
#
# A column for which the test raises an error is reported with a message and
# skipped, so the remaining columns are still tested.
run_shapiro_tests <- function(data, labels) {
  for (j in seq_along(data)) {
    print(labels[j])
    tryCatch(
      print(shapiro.test(data[, j])),
      error = function(e) {
        message(
          "Shapiro-Wilk test failed for ", labels[j], ": ",
          conditionMessage(e)
        )
      }
    )
  }
  invisible(NULL)
}

# log transformation
# NOTE: log10() returns -Inf for zeros and NaN (with a warning) for negative
# values, so columns containing non-positive values need a closer look.
log_my_data <- my_data
# Assigning into `[]` keeps the row and column names of the original table.
log_my_data[] <- lapply(my_data, log10)
log_my_data

# normality test on the original data
run_shapiro_tests(my_data, varnames)

# normality test on log-transformed data
run_shapiro_tests(log_my_data, varnames)