The goal of endogenr is to make it easy to simulate dynamic systems from regression models, mathematical equations, and exogenous inputs (either based on a stochastic distribution, or given by some data). We assume a panel-data structure, with two variables denoting the time and the space dimensions.
The simulator works by identifying the dependency graph of the models added to the system, and deriving the order of calculation based on this graph.
It works with parallel simulation using the future::multisession approach.
You can install the development version of endogenr from GitHub with:
# install.packages("pak")
pak::pak("prio-data/endogenr")
# alternatively with
renv::install("prio-data/endogenr")You can also clone (download) the repository, open it as a project in RStudio, find the “Build” tab, and press “Install”.
library(endogenr)
library(fable)
#> Loading required package: fabletools
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
df <- endogenr::example_data
df <- tsibble::as_tsibble(df, key = "gwcode", index = "year")
e1 <- gdppc_grwt ~ lag(yjbest) + lag(gdppc_grwt) + lag(log(gdppc)) + lag(psecprop) + lag(zoo::rollmean(gdppc_grwt, k = 3, fill = NA, align = "right"))
c1 <- yjbest ~ lag(yjbest) + lag(log(gdppc)) + lag(log(population)) + lag(psecprop) + lag(dem) + lag(gdppc_grwt) + lag(zoo::rollmean(yjbest, k = 5, fill = NA, align = "right"))
d1 <- dem ~ lag(dem) + lag(gdppc_grwt) + lag(log(gdppc)) + lag(yjbest) + lag(psecprop) + lag(zoo::rollmean(dem, k = 3, fill = NA, align = "right"))
model_system <- list(
build_model("deterministic",formula = gdppc ~ I(abs(lag(gdppc)*(1+gdppc_grwt)))),
build_model("deterministic", formula = gdp ~ I(abs(gdppc*population))),
build_model("parametric_distribution", formula = ~gdppc_grwt, distribution = "norm"),
build_model("linear", formula = c1, boot = "resid"),
build_model("univariate_fable", formula = dem ~ error("A") + trend("N") + season("N"), method = "ets"),
build_model("exogen", formula = ~psecprop),
build_model("exogen", formula = ~population)
)
simulator_setup <- setup_simulator(models = model_system,
data = df,
train_start = 1970,
test_start = 2010,
horizon = 12,
groupvar = "gwcode",
timevar = "year",
inner_sims = 2,
min_window = 10)
set.seed(42)
res <- simulate_endogenr(nsim = 2, simulator_setup = simulator_setup, parallel = F)
# Example that you can back-transform variables you have simulated as transformed variables
scaled_logit <- function(x, lower=0, upper=1){
log((x-lower)/(upper-x))
}
inv_scaled_logit <- function(x, lower=0, upper=1){
(upper-lower)*exp(x)/(1+exp(x)) + lower
}
my_scaled_logit <- fabletools::new_transformation(scaled_logit, inv_scaled_logit)
yj <- scales::transform_yj(p = 0.4) # This was transformed using lambda 0.4 when creating the data.
# Back-transform
res <- res |> dplyr::mutate(v2x_libdem = inv_scaled_logit(dem),
best = yj$inverse(yjbest))
res <- tsibble::tsibble(res, key = c(simulator_setup$groupvar, ".sim"), index = simulator_setup$timevar) |>
dplyr::filter(year >= simulator_setup$test_start)
acc <- get_accuracy(res, "gdppc_grwt", df)
#> Warning: The dimnames of the fable's distribution are missing and have been set
#> to match the response variables.
acc |> summarize(across(crps:winkler, ~ mean(.x))) |> arrange(crps) |> knitr::kable()| crps | mae | winkler |
|---|---|---|
| 0.034463 | 0.0430068 | 0.149843 |
plotsim(res, "gdppc", c(2, 20, 530), df)plotsim(res, "gdppc_grwt", c(2, 20, 530), df)plotsim(res, "v2x_libdem", c(2, 20, 530), df)plotsim(res, "best", c(2, 20, 530), df)endogenr supports spatial lag variables in the simulation. The setup
involves two steps: (1) computing the spatial lag for the historical
data, and (2) registering a spatial_lag model in the model system so
the lag is recomputed at each simulated time step.
The convenience function st_weights_from_sf creates a neighborhood
list and weights from an sf object. It defaults to queen contiguity.
For more advanced weight schemes, use the sfdep package directly and
supply the neighborhood list (nb), weights (wt), and the unit IDs in
the order they appear in the spatial object.
Note that computing the spatial lag for the historical data can be fiddly when the panel is unbalanced or has missing observations. The example below filters the data to units present in the neighborhood structure before computing the lag.
library(endogenr)
library(dplyr)
df <- endogenr::example_data
df <- tsibble::as_tsibble(df, key = "gwcode", index = "year")
# Load a map and filter to units present in the data
map <- poldat::cshp_gw_modifications(france_overseas = FALSE) |>
dplyr::filter(end == as.Date("2019-12-31"))
map <- map |> dplyr::filter(gwcode %in% unique(df$gwcode))
# Build spatial weights (queen contiguity by default)
sf::sf_use_s2(FALSE)
neigh <- st_weights_from_sf(map, "gwcode", weights_args = list(allow_zero = TRUE))
# Compute the spatial lag of yjbest for each year in the historical data
df <- df |>
dplyr::filter(gwcode %in% neigh$unit_ids) |>
as_tibble() |>
group_by(year) |>
dplyr::mutate(
sl_yjbest = sfdep::st_lag(yjbest, neigh$nb, neigh$wt, allow_zero = TRUE)
)
df <- tsibble::tsibble(df, key = "gwcode", index = "year")A spatial_lag model works like a deterministic model — it is a
transformation applied at each simulated time step t. This ensures the
spatial lag variable is updated as the underlying variable evolves
during simulation.
One important consideration is how the spatial lag variable enters
other model formulas. Using yjbest ~ sl_yjbest will not work because
sl_yjbest is computed in the same period as yjbest, creating a
circular dependency. Instead, use lag(sl_yjbest) so the conflict model
uses the spatial lag from the previous period.
e1 <- gdppc_grwt ~ lag(yjbest) + lag(gdppc_grwt) + lag(log(gdppc)) + lag(psecprop) +
lag(zoo::rollmean(gdppc_grwt, k = 3, fill = NA, align = "right"))
c1 <- yjbest ~ lag(yjbest) + lag(sl_yjbest) + lag(log(gdppc)) + lag(log(population)) +
lag(psecprop) + lag(dem) + lag(gdppc_grwt) +
lag(zoo::rollmean(yjbest, k = 5, fill = NA, align = "right"))
d1 <- dem ~ lag(dem) + lag(gdppc_grwt) + lag(log(gdppc)) + lag(yjbest) + lag(psecprop) +
lag(zoo::rollmean(dem, k = 3, fill = NA, align = "right"))
model_system <- list(
# spatial_lag recomputes sl_yjbest from yjbest at each simulated t
build_model("spatial_lag", formula = sl_yjbest ~ yjbest,
nb = neigh$nb, wt = neigh$wt, unit_ids = neigh$unit_ids,
island_default = 0),
build_model("deterministic", formula = gdppc ~ I(abs(lag(gdppc) * (1 + gdppc_grwt)))),
build_model("deterministic", formula = gdp ~ I(abs(gdppc * population))),
build_model("parametric_distribution", formula = ~gdppc_grwt, distribution = "norm"),
build_model("linear", formula = c1, boot = "resid"),
build_model("univariate_fable", formula = dem ~ error("A") + trend("N") + season("N"),
method = "ets"),
build_model("exogen", formula = ~psecprop),
build_model("exogen", formula = ~population)
)
simulator_setup <- setup_simulator(
models = model_system,
data = df,
train_start = 1970,
test_start = 2010,
horizon = 12,
groupvar = "gwcode",
timevar = "year",
inner_sims = 2,
min_window = 10
)
set.seed(42)
res <- simulate_endogenr(nsim = 2, simulator_setup = simulator_setup, parallel = FALSE)


