# 03_PenalizedGLM_ModelExample.R
# Fit the same model, but shrink the distributor-code coefficients using penalization
# Use the base GLM fitted in the 02_LME_ModelExample script as an offset for this example
library(glmnet)
library(dplyr)       # %>%, filter(), group_by(), summarize()
library(reshape2)    # melt()
library(doParallel)  # makeCluster()/registerDoParallel() for the alpha tuning loop
# Read data/model ####
glm.model.base <- readRDS("IgnoreList/GLM_base_ModelObject.RDS")
modeldata <- readRDS("SampleDataset.RDS")
# Note that the offset must be on the link scale (i.e., log-odds, not probabilities)
modeldata[["offset"]] <- predict(glm.model.base, modeldata, type = "link")
# Prepare model matrix ####
# Consider the dummyVars() function from the caret package as an alternative (see the sketch after this block)
# contrasts.arg must be a named list of contrast matrices (a bare matrix is
# ignored with a warning); contrasts = FALSE keeps a dummy column for every
# distributor level rather than dropping a reference level, which lets the
# penalty treat all levels symmetrically
x.train <- model.matrix(object = ~ -1 + DistCode,
                        data = modeldata %>% filter(Sample == "training"),
                        contrasts.arg = list(DistCode = contrasts(modeldata$DistCode,
                                                                  contrasts = FALSE)))
x.holdout <- model.matrix(object = ~ -1 + DistCode,
                          data = modeldata %>% filter(Sample == "holdout"),
                          contrasts.arg = list(DistCode = contrasts(modeldata$DistCode,
                                                                    contrasts = FALSE)))
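# As suggested above, caret's dummyVars() is an equivalent way to build these
# matrices; a minimal sketch, assuming caret is installed (the *.alt names are
# illustrative only):
# dv <- caret::dummyVars(~ DistCode, data = modeldata)  # fullRank = FALSE by default, so every level keeps a column
# x.train.alt <- predict(dv, newdata = modeldata %>% filter(Sample == "training"))
# x.holdout.alt <- predict(dv, newdata = modeldata %>% filter(Sample == "holdout"))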
# Fit penalized GLM ####
# Elastic net fit (takes roughly 5-6 minutes)
system.time(
  cv.glmnet.model <- cv.glmnet(x = x.train,
                               y = modeldata$Surr[modeldata$Sample == "training"],
                               family = "binomial",
                               standardize = TRUE,
                               intercept = TRUE,
                               alpha = 0.5,  # equal mix of LASSO and ridge penalties
                               nfolds = 5,
                               offset = modeldata$offset[modeldata$Sample == "training"])
)
plot(cv.glmnet.model, sign.lambda = -1)  # plot() dispatches to glmnet's cv.glmnet method
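# The two vertical lines on the plot mark lambda.min and lambda.1se; both are
# stored on the fitted object and are worth printing before choosing s below
c(lambda.min = cv.glmnet.model$lambda.min, lambda.1se = cv.glmnet.model$lambda.1se)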
# Summarize coefficients ####
C <- coef(cv.glmnet.model, s = "lambda.min")
# Intercept
C[setdiff(c(1:nrow(C)), grep("DistCode", rownames(C))),]
# Summary of distributor coefficients
summary(C[grep("DistCode", rownames(C)),])
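# A quick read on how hard the LASSO component shrinks: count the distributor
# coefficients set exactly to zero at lambda.min
sum(C[grep("DistCode", rownames(C)), ] == 0)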
# Alpha tuning ####
# Modelers often tune the alpha parameter, which weights the LASSO and ridge penalties
# Here is a simple grid search over alpha, run in parallel
alpha.grid <- c(0, 0.5, 1)  # ridge, elastic net, LASSO
cv.models <- vector("list", length(alpha.grid))
cl <- makeCluster(5)  # 5 workers; one cluster is reused across all three fits
registerDoParallel(cl)
system.time({
  for (i in seq_along(alpha.grid)) {
    cv.models[[i]] <- cv.glmnet(x = x.train,
                                y = modeldata$Surr[modeldata$Sample == "training"],
                                family = "binomial",
                                standardize = TRUE,
                                intercept = TRUE,
                                alpha = alpha.grid[i],
                                nfolds = 5,
                                parallel = TRUE,
                                offset = modeldata$offset[modeldata$Sample == "training"])
    gc()
  }
})
stopCluster(cl)
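# Before scoring the holdout, compare the cross-validated loss (binomial
# deviance, the cv.glmnet default here) at each fit's best lambda
setNames(sapply(cv.models, function(m) min(m$cvm)),
         paste0("alpha", c(0, 50, 100)))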
# Model comparison on holdout ####
modeldata[["preds.alpha0"]][modeldata$Sample == "holdout"] <- predict(cv.models[[1]],
x.holdout,
type = "response",
newoffset = modeldata$offset[modeldata$Sample == "holdout"],
s = "lambda.min")
modeldata[["preds.alpha50"]][modeldata$Sample == "holdout"] <- predict(cv.models[[2]],
x.holdout,
type = "response",
newoffset = modeldata$offset[modeldata$Sample == "holdout"],
s = "lambda.min")
modeldata[["preds.alpha100"]][modeldata$Sample == "holdout"] <- predict(cv.models[[3]],
x.holdout,
type = "response",
newoffset = modeldata$offset[modeldata$Sample == "holdout"],
s = "lambda.min")
modeldata %>%
  filter(!is.na(preds.alpha50)) %>%  # keep only the scored holdout rows
  melt(measure.vars = c("preds.alpha0", "preds.alpha50", "preds.alpha100")) %>%
  group_by(variable) %>%
  summarize(logloss = -sum(Surr*log(value) + (1 - Surr)*log(1 - value)))
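# For context, the same holdout logloss for the unpenalized base GLM alone,
# converting its link-scale offset back to probabilities (again assumes a logit
# link, consistent with the offset comment above)
modeldata %>%
  filter(Sample == "holdout") %>%
  summarize(logloss.base = -sum(Surr*log(plogis(offset)) +
                                  (1 - Surr)*log(1 - plogis(offset))))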
# Compare coefficient ranges...