Skip to content

prio-data/endogenr

Repository files navigation

endogenr

The goal of endogenr is to make it easy to simulate dynamic systems from regression models, mathematical equations, and exogenous inputs (either based on a stochastic distribution, or given by some data). We assume a panel-data structure, with two variables denoting the time and the space dimensions.

The simulator works by identifying the dependency graph of the models added to the system, and deriving the order of calculation based on this graph.

It works with parallel simulation using the future::multisession approach.

Installation

You can install the development version of endogenr from GitHub with:

# install.packages("pak")
pak::pak("prio-data/endogenr")
# alternatively with
renv::install("prio-data/endogenr")

You can also clone (download) the repository, open it as a project in RStudio, find the “Build” tab, and press “Install”.

Example

library(endogenr)
library(fable)
#> Loading required package: fabletools
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
df <- endogenr::example_data
df <- tsibble::as_tsibble(df, key = "gwcode", index = "year")

e1 <- gdppc_grwt ~ lag(yjbest) + lag(gdppc_grwt) + lag(log(gdppc)) + lag(psecprop) + lag(zoo::rollmean(gdppc_grwt, k = 3, fill = NA, align = "right"))
c1 <- yjbest ~ lag(yjbest) + lag(log(gdppc)) + lag(log(population)) + lag(psecprop) + lag(dem) + lag(gdppc_grwt) + lag(zoo::rollmean(yjbest, k = 5, fill = NA, align = "right"))
d1 <- dem ~ lag(dem) + lag(gdppc_grwt) + lag(log(gdppc)) + lag(yjbest) + lag(psecprop) + lag(zoo::rollmean(dem, k = 3, fill = NA, align = "right"))

model_system <- list(
  build_model("deterministic",formula = gdppc ~ I(abs(lag(gdppc)*(1+gdppc_grwt)))),
  build_model("deterministic", formula = gdp ~ I(abs(gdppc*population))),
  build_model("parametric_distribution", formula = ~gdppc_grwt, distribution = "norm"),
  build_model("linear", formula = c1, boot = "resid"),
  build_model("univariate_fable", formula = dem ~ error("A") + trend("N") + season("N"), method = "ets"),
  build_model("exogen", formula = ~psecprop),
  build_model("exogen", formula = ~population)
)

simulator_setup <- setup_simulator(models = model_system,
                                   data = df,
                                   train_start = 1970,
                                   test_start = 2010,
                                   horizon = 12,
                                   groupvar = "gwcode",
                                   timevar = "year",
                                   inner_sims = 2,
                                   min_window = 10)

set.seed(42)
res <- simulate_endogenr(nsim = 2, simulator_setup = simulator_setup, parallel = F)

# Example that you can back-transform variables you have simulated as transformed variables
scaled_logit <- function(x, lower=0, upper=1){
  log((x-lower)/(upper-x))
}
inv_scaled_logit <- function(x, lower=0, upper=1){
  (upper-lower)*exp(x)/(1+exp(x)) + lower
}
my_scaled_logit <- fabletools::new_transformation(scaled_logit, inv_scaled_logit)

yj <- scales::transform_yj(p = 0.4) # This was transformed using lambda 0.4 when creating the data.
# Back-transform
res <- res |> dplyr::mutate(v2x_libdem = inv_scaled_logit(dem),
                            best = yj$inverse(yjbest))

res <- tsibble::tsibble(res, key = c(simulator_setup$groupvar, ".sim"), index = simulator_setup$timevar) |>
  dplyr::filter(year >= simulator_setup$test_start)

acc <- get_accuracy(res, "gdppc_grwt", df)
#> Warning: The dimnames of the fable's distribution are missing and have been set
#> to match the response variables.
acc |> summarize(across(crps:winkler, ~ mean(.x))) |> arrange(crps) |> knitr::kable()
crps mae winkler
0.034463 0.0430068 0.149843
plotsim(res, "gdppc", c(2, 20, 530), df)

plotsim(res, "gdppc_grwt", c(2, 20, 530), df)

plotsim(res, "v2x_libdem", c(2, 20, 530), df)

plotsim(res, "best", c(2, 20, 530), df)

Spatial Lag

endogenr supports spatial lag variables in the simulation. The setup involves two steps: (1) computing the spatial lag for the historical data, and (2) registering a spatial_lag model in the model system so the lag is recomputed at each simulated time step.

Step 1: Prepare the spatial weights and compute the historical lag

The convenience function st_weights_from_sf creates a neighborhood list and weights from an sf object. It defaults to queen contiguity. For more advanced weight schemes, use the sfdep package directly and supply the neighborhood list (nb), weights (wt), and the unit IDs in the order they appear in the spatial object.

Note that computing the spatial lag for the historical data can be fiddly when the panel is unbalanced or has missing observations. The example below filters the data to units present in the neighborhood structure before computing the lag.

library(endogenr)
library(dplyr)

df <- endogenr::example_data
df <- tsibble::as_tsibble(df, key = "gwcode", index = "year")

# Load a map and filter to units present in the data
map <- poldat::cshp_gw_modifications(france_overseas = FALSE) |>
  dplyr::filter(end == as.Date("2019-12-31"))
map <- map |> dplyr::filter(gwcode %in% unique(df$gwcode))

# Build spatial weights (queen contiguity by default)
sf::sf_use_s2(FALSE)
neigh <- st_weights_from_sf(map, "gwcode", weights_args = list(allow_zero = TRUE))

# Compute the spatial lag of yjbest for each year in the historical data
df <- df |>
  dplyr::filter(gwcode %in% neigh$unit_ids) |>
  as_tibble() |>
  group_by(year) |>
  dplyr::mutate(
    sl_yjbest = sfdep::st_lag(yjbest, neigh$nb, neigh$wt, allow_zero = TRUE)
  )

df <- tsibble::tsibble(df, key = "gwcode", index = "year")

Step 2: Add a spatial_lag model to the system

A spatial_lag model works like a deterministic model — it is a transformation applied at each simulated time step t. This ensures the spatial lag variable is updated as the underlying variable evolves during simulation.

One important consideration is how the spatial lag variable enters other model formulas. Using yjbest ~ sl_yjbest will not work because sl_yjbest is computed in the same period as yjbest, creating a circular dependency. Instead, use lag(sl_yjbest) so the conflict model uses the spatial lag from the previous period.

e1 <- gdppc_grwt ~ lag(yjbest) + lag(gdppc_grwt) + lag(log(gdppc)) + lag(psecprop) +
  lag(zoo::rollmean(gdppc_grwt, k = 3, fill = NA, align = "right"))
c1 <- yjbest ~ lag(yjbest) + lag(sl_yjbest) + lag(log(gdppc)) + lag(log(population)) +
  lag(psecprop) + lag(dem) + lag(gdppc_grwt) +
  lag(zoo::rollmean(yjbest, k = 5, fill = NA, align = "right"))
d1 <- dem ~ lag(dem) + lag(gdppc_grwt) + lag(log(gdppc)) + lag(yjbest) + lag(psecprop) +
  lag(zoo::rollmean(dem, k = 3, fill = NA, align = "right"))

model_system <- list(
  # spatial_lag recomputes sl_yjbest from yjbest at each simulated t
  build_model("spatial_lag", formula = sl_yjbest ~ yjbest,
              nb = neigh$nb, wt = neigh$wt, unit_ids = neigh$unit_ids,
              island_default = 0),
  build_model("deterministic", formula = gdppc ~ I(abs(lag(gdppc) * (1 + gdppc_grwt)))),
  build_model("deterministic", formula = gdp ~ I(abs(gdppc * population))),
  build_model("parametric_distribution", formula = ~gdppc_grwt, distribution = "norm"),
  build_model("linear", formula = c1, boot = "resid"),
  build_model("univariate_fable", formula = dem ~ error("A") + trend("N") + season("N"),
              method = "ets"),
  build_model("exogen", formula = ~psecprop),
  build_model("exogen", formula = ~population)
)

simulator_setup <- setup_simulator(
  models = model_system,
  data = df,
  train_start = 1970,
  test_start = 2010,
  horizon = 12,
  groupvar = "gwcode",
  timevar = "year",
  inner_sims = 2,
  min_window = 10
)

set.seed(42)
res <- simulate_endogenr(nsim = 2, simulator_setup = simulator_setup, parallel = FALSE)

About

A dynamic endogenous simulator of statistical models in R

Resources

License

Stars

Watchers

Forks

Packages

 
 
 

Contributors

Languages