-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy path01_CreateStylizedData.R
More file actions
50 lines (42 loc) · 1.86 KB
/
01_CreateStylizedData.R
File metadata and controls
50 lines (42 loc) · 1.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# Create a dataset styled after our industry lapse data
# This dataset contains no true records from our industry data,
# but the lapse response variable (Surr) has relationships with the
# predictor variables that approximate those found in said industry data.
# library(dplyr)
# How many quarterly records?
num.records <- 200000
# Create covariates
set.seed(1)
sampledata <- data.frame(CSV = rexp(num.records, 1/120000) + 1000,
q = pmin(55, ceiling(rexp(num.records, 1/12))),
SCPeriod = 7,
ITM = rnorm(num.records, 1.2, 0.15),
DistCode = as.factor(ceiling(rexp(num.records, 1/50)))) %>%
mutate(IN = ifelse(q <= SCPeriod*4, 1, 0),
Dur_IN = pmin(0, (q - SCPeriod*4)/(SCPeriod*4)),
Dur_OUT = pmax(0, q - SCPeriod*4),
GLWBBenBase = ITM/CSV)
# Set distributor effects
distributors <- data.frame(DistCode = unique(sampledata$DistCode),
Effect = rnorm(length(unique(sampledata$DistCode)), -4, 0.25))
# Randomly generate surrender response
modeldata <- sampledata %>%
left_join(distributors,
by = "DistCode") %>%
mutate(surr_logodds = -1.3*IN + 1.6*ITM + 4.6*Dur_IN - 0.3*Dur_OUT - 2.6*ITM*Dur_IN + 0.15*ITM*Dur_OUT + Effect)
modeldata <- modeldata %>%
mutate(Surr = ifelse(runif(n()) < 1 / (1 + exp(-surr_logodds)), 1, 0))
# Create holdout subset ####
# Guarantee that each distributor has observations in each subset
set.seed(2)
modeldata <- modeldata %>%
group_by(DistCode) %>%
mutate(Sample = ifelse(rep(n(), n()) == 1,
"training",
ifelse(row_number() %in% sample(1:n(), ceiling(n()/2), replace = F),
"training",
"holdout"))) %>%
ungroup()
# Save data
saveRDS(modeldata,
file = "SampleDataset.RDS")