diff --git a/DESCRIPTION b/DESCRIPTION index 2c7e9575..5c76d875 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: netdiffuseR Title: Analysis of Diffusion and Contagion Processes on Networks -Version: 1.24.0 +Version: 1.25.0 Authors@R: c( person("George", "Vega Yon", email="g.vegayon@gmail.com", role=c("aut", "cre"), comment=c(ORCID = "0000-0002-3171-0844", what="Rewrite functions with Rcpp, plus new features") @@ -8,7 +8,7 @@ Authors@R: c( person("Thomas", "Valente", email="tvalente@usc.edu", role=c("aut", "cph"), comment=c(ORCID="0000-0002-8824-5816", what="R original code")), person("Anibal", "Olivera Morales", role = c("aut", "ctb"), - comment=c(ORCID="0009-0000-3736-7939", what="Multi-diffusion version")), + comment=c(ORCID="0009-0000-3736-7939", what="Developer from V1.23.0")), person("Stephanie", "Dyal", email="stepharp@usc.edu", role=c("ctb"), comment="Package's first version"), person("Timothy", "Hayes", email="timothybhayes@gmail.com", role=c("ctb"), comment="Package's first version") ) @@ -21,7 +21,7 @@ Description: Empirical statistical analysis, visualization and simulation of 9781881303213>, Myers (2000) , Iyengar and others (2011) , Burt (1987) ; among others. Depends: - R (>= 3.1.1) + R (>= 3.5) License: MIT + file LICENSE LazyData: true Imports: @@ -65,6 +65,7 @@ Collate: 'bass.r' 'bootnet.r' 'citer_environment.R' + 'collapse_timeframes.R' 'data.r' 'degree_adoption_diagnostic.R' 'diffnet-c.R' diff --git a/NAMESPACE b/NAMESPACE index d3a9d6e9..4a6bdac3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -98,6 +98,7 @@ export(bootnet) export(classify) export(classify_adopters) export(classify_graph) +export(collapse_timeframes) export(compare_matrix) export(cumulative_adopt_count) export(degree_adoption_diagnostic) diff --git a/NEWS.md b/NEWS.md index c3f66257..0421f546 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,18 @@ +# Changes in netdiffuseR version 1.25.0 (2026-03-14) + +* New function `collapse_timeframes()`: aggregates high-resolution or + continuous-time longitudinal edgelists into discrete time windows, ready + for use with `edgelist_to_adjmat()` or `as_diffnet()`. The function contains + parameters such as `binarize`, `cumulative`, and `symmetric` for better control + over the aggregation process. + +* New dataset `epigames` and `epigamesDiffNet`: a simulated epidemic game + network with 594 nodes and 15 time periods from the WKU Epi Games study. + +* Fixed CRAN example error in `round_to_seq()`: `plot(w, x)` replaced with + `plot(w)` to avoid `%||%` operator issue in R 4.4.0+'s `formula.default` + when called via `plot.data.frame()`. + # Changes in netdiffuseR version 1.24.0 (2025-12-09) * New function `degree_adoption_diagnostic()` analyzes the correlation between network diff --git a/R/collapse_timeframes.R b/R/collapse_timeframes.R new file mode 100644 index 00000000..de55719e --- /dev/null +++ b/R/collapse_timeframes.R @@ -0,0 +1,165 @@ +#' Collapse Timeframes in a Longitudinal Edgelist +#' +#' @description +#' Allows users to take a high-resolution or continuous-time longitudinal +#' edgelist and dynamically collapse or discretize it into larger time windows. +#' The output is a shorter, aggregated edgelist ready to be passed into +#' \code{[edgelist_to_adjmat]} or \code{[as_diffnet]}. +#' +#' @param edgelist A \code{data.frame} representing the longitudinal edgelist. +#' @param ego Character scalar. Name of the column representing the ego (sender). +#' @param alter Character scalar. Name of the column representing the alter (receiver). +#' @param timevar Character scalar. Name of the column representing the time variable. +#' @param weightvar Character scalar or \code{NULL}. Name of the column representing +#' the edge weight. If \code{NULL}, the function tallies the number of interactions +#' within the time window as the weight. +#' @param window_size Numeric scalar. The size of the time window to collapse into. +#' @param time_format Character scalar or \code{NULL}. If the time variable is a +#' character or factor, the format passed to \code{as.POSIXct}. +#' For example, \code{"\%d-\%m-\%Y \%H:\%M"}. +#' @param relative_time Logical scalar. If \code{TRUE}, normalizes the binned +#' times into a strict integer sequence starting at 1 (1, 2, 3...). +#' @param binarize Logical scalar. If \code{TRUE}, sets all resulting edge weights to 1. +#' @param cumulative Logical scalar. If \code{TRUE}, edges from previous time windows +#' are carried over to subsequent windows. +#' @param symmetric Logical scalar. If \code{TRUE}, the resulting graph will be +#' symmetrized (i.e., if an edge A->B exists, an edge B->A is added). +#' +#' @return A \code{data.frame} with 4 columns: the ego, the alter, the new collapsed +#' discrete time, and the aggregated weight. +#' +#' @export +#' @examples +#' \dontrun{ +#' # Load the package's hourly dataset +#' load(system.file("data/epigames_raw.rda", package = "netdiffuseR")) +#' +#' # Collapse the hourly edgelist into a daily edgelist (window_size = 24) +#' daily_edgelist <- collapse_timeframes( +#' edgelist = epigames_raw$edgelist, +#' timevar = "time", +#' weightvar = "weight", +#' window_size = 24 +#' ) +#' head(daily_edgelist) +#' } +collapse_timeframes <- function( + edgelist, + ego = "sender", + alter = "receiver", + timevar = "time", + weightvar = NULL, + window_size = 1, + time_format = NULL, + relative_time = TRUE, + binarize = FALSE, + cumulative = FALSE, + symmetric = FALSE) { + # Step 1: Time Column Parsing + time_raw <- edgelist[[timevar]] + + if (is.character(time_raw) || is.factor(time_raw)) { + if (!is.null(time_format)) { + time_raw <- as.numeric(as.POSIXct(as.character(time_raw), format = time_format)) + } else { + time_raw <- as.numeric(as.POSIXct(as.character(time_raw))) + } + } else if (!is.numeric(time_raw) && !is.integer(time_raw)) { + # e.g., Date or POSIXct already + time_raw <- as.numeric(time_raw) + } + + # Check for NAs after conversion + if (any(is.na(time_raw))) { + warning("There are NA values in the parsed time variable.") + } + + # Step 2: Binning / Window Creation + t_min <- min(time_raw, na.rm = TRUE) + # Adding a tiny offset so min time doesn't fall out of bounds or shift unnecessarily + discrete_time <- ceiling((time_raw - t_min + 1e-9) / window_size) + # Ensure the minimum index is 1 at this stage + min_dt <- min(discrete_time, na.rm = TRUE) + if (min_dt < 1) { + discrete_time <- discrete_time - min_dt + 1 + } + + # Step 3: Handling relative_time + if (relative_time) { # e.g. strict sequence 1, 2, 3 + sorted_unique_times <- sort(unique(discrete_time[!is.na(discrete_time)])) + time_map <- stats::setNames(seq_along(sorted_unique_times), sorted_unique_times) + discrete_time <- unname(time_map[as.character(discrete_time)]) + } + + # Create a working data frame to hold things + work_df <- data.frame( + ego_col = edgelist[[ego]], + alter_col = edgelist[[alter]], + time_col = discrete_time + ) + + # Step 4: Aggregation + if (is.null(weightvar)) { + work_df$weight_col <- 1 + } else { + work_df$weight_col <- edgelist[[weightvar]] + } + + # Remove rows with NAs in essential grouping variables + work_df <- work_df[!is.na(work_df$ego_col) & !is.na(work_df$alter_col) & !is.na(work_df$time_col), ] + + agg_df <- stats::aggregate( + weight_col ~ ego_col + alter_col + time_col, + data = work_df, + FUN = sum, + na.rm = TRUE + ) + + # Step 5: Output with 4 clean columns + weight_col_name <- if (is.null(weightvar)) "weight" else weightvar + colnames(agg_df) <- c(ego, alter, timevar, weight_col_name) + + # Step 6: Post-aggregation processing + + # 6.1 Binarize + if (binarize) { + agg_df[[weight_col_name]] <- 1 + } + + # 6.2 Symmetrize + if (symmetric) { + rev_df <- agg_df + rev_df[[ego]] <- agg_df[[alter]] + rev_df[[alter]] <- agg_df[[ego]] + + # Combine and de-duplicate (in case they already existed symmetrically) + agg_df <- unique(rbind(agg_df, rev_df)) + } + + # 6.3 Cumulative + if (cumulative) { + all_periods <- sort(unique(agg_df[[timevar]])) + if (length(all_periods) > 1) { + cumulative_el <- agg_df[agg_df[[timevar]] == all_periods[1], ] + for (t_idx in 2:length(all_periods)) { + t <- all_periods[t_idx] + current <- agg_df[agg_df[[timevar]] == t, ] + prev <- cumulative_el[cumulative_el[[timevar]] == all_periods[t_idx - 1], ] + if (nrow(prev) > 0) { + prev[[timevar]] <- t + } + # Combine current window with previous accumulated edges and de-duplicate + combined <- unique(rbind(current, prev)) + cumulative_el <- rbind(cumulative_el, combined) + } + agg_df <- cumulative_el + } + } + + # Apply standard sort for consistent outputs: time, ego, alter + order_idx <- order(agg_df[[timevar]], agg_df[[ego]], agg_df[[alter]]) + agg_df <- agg_df[order_idx, ] + rownames(agg_df) <- NULL + + return(agg_df) +} diff --git a/R/data.r b/R/data.r index e69ba8c9..4ed3b835 100644 --- a/R/data.r +++ b/R/data.r @@ -778,23 +778,23 @@ NULL # "medInnovationsDiffNet" #' the Brazilian Farmers collected as part of the three country study implemented #' by Everett Rogers (Rogers, Ascroft, & Röling, 1970), and Korean Family Planning #' data collected by researchers at the Seoul National University's School of -#' Public (Rogers & Kincaid, 1981). The table below summarizes the three datasets: -#' -#' \tabular{lccc}{ -#' \tab \bold{Medical Innovation} \tab \bold{Brazilian Farmers} \tab \bold{Korean Family Planning} \cr -#' \emph{Country} \tab USA \tab Brazil \tab Korean \cr -#' \emph{# Respondents} \tab 125 Doctors \tab 692 Farmers \tab 1,047 Women \cr -#' \emph{# Communities} \tab 4 \tab 11 \tab 25 \cr -#' \emph{Innovation} \tab Tetracycline \tab Hybrid Corn Seed \tab Family Planning \cr -#' \emph{Time for Diffusion} \tab 18 Months \tab 20 Years \tab 11 Years \cr -#' \emph{Year Data Collected} \tab 1955-1956 \tab 1966 \tab 1973 \cr -#' \emph{Ave. Time to 50\%} \tab 6 \tab 16 \tab 7 \cr -#' \emph{Highest Saturation} \tab 0.89 \tab 0.98 \tab 0.83 \cr -#' \emph{Lowest Saturation} \tab 0.81 \tab 0.29 \tab 0.44 \cr -#' \emph{Citation} \tab Coleman et al (1966) \tab Rogers et al (1970) \tab Rogers & Kincaid (1981) \cr -#' } -#' -#' All datasets include a column called \emph{study} which is coded as +#' Public (Rogers & Kincaid, 1981). The table below summarizes the datasets: +#' +#' \tabular{lcccc}{ +#' \tab \bold{Medical Innovation} \tab \bold{Brazilian Farmers} \tab \bold{Korean Family Planning} \tab \bold{WKU Epi Games} \cr +#' \emph{Country} \tab USA \tab Brazil \tab Korean \tab USA \cr +#' \emph{# Respondents} \tab 125 Doctors \tab 692 Farmers \tab 1,047 Women \tab 594 Students \cr +#' \emph{# Communities} \tab 4 \tab 11 \tab 25 \tab Multiple groups \cr +#' \emph{Innovation} \tab Tetracycline \tab Hybrid Corn Seed \tab Family Planning \tab Masks/Medicine \cr +#' \emph{Time for Diffusion} \tab 18 Months \tab 20 Years \tab 11 Years \tab 15 Periods \cr +#' \emph{Year Data Collected} \tab 1955-1956 \tab 1966 \tab 1973 \tab Recent \cr +#' \emph{Ave. Time to 50\%} \tab 6 \tab 16 \tab 7 \tab N/A \cr +#' \emph{Highest Saturation} \tab 0.89 \tab 0.98 \tab 0.83 \tab N/A \cr +#' \emph{Lowest Saturation} \tab 0.81 \tab 0.29 \tab 0.44 \tab N/A \cr +#' \emph{Citation} \tab Coleman et al (1966) \tab Rogers et al (1970) \tab Rogers & Kincaid (1981) \tab WKU \cr +#' } +#' +#' All core datasets include a column called \emph{study} which is coded as #' (1) Medical Innovation (2) Brazilian Farmers, (3) Korean Family Planning. #' #' @section Right censored data: @@ -938,3 +938,52 @@ NULL #' @author George G. Vega Yon #' @name fakeEdgelist NULL # "fakeEdgelist" + + +#' Epi Games Dataset +#' +#' @description +#' The WKU Epi Games dataset represents a simulated epidemic or game environment with +#' dynamic encounters over 15 time periods. It provides both node-level +#' attributes and a longitudinal edgelist. +#' +#' @format A list with two data frames: +#' +#' **attributes**: A data frame with 594 rows and 9 variables representing nodes: +#' \describe{ +#' \item{id}{Unique identifier for the participant.} +#' \item{toa}{Time of Adoption (1 to 15), representing when the individual was first infected. Non-infected individuals have `NA`.} +#' \item{qyes_total}{Cumulative count of times the player participated or scored positively in informative/educational "quarantine" questionnaires.} +#' \item{qno_total}{Cumulative count of times the non-quarantine questionnaire factor was registered.} +#' \item{mask_prop}{Proportion of time (across 15 steps) the participant used the mask intervention (0.0 to 1.0).} +#' \item{med_prop}{Proportion of time the individual used pharmacological interventions or treatments.} +#' \item{group}{Experimental group or node cohort.} +#' \item{final_score}{Final score obtained in the game.} +#' \item{status}{Final state label ("infected" or "not_infected").} +#' } +#' +#' **edgelist**: A longitudinal data frame with 23,684 rows and 4 variables representing edges/contacts: +#' \describe{ +#' \item{sender}{Origin node ID of the contact.} +#' \item{receiver}{Destination node ID of the contact.} +#' \item{time}{Time period of the contact (1 to 15).} +#' \item{weight}{Strength, duration, or density of the exposure.} +#' } +#' +#' @source WKU Epi Game simulation +#' @family diffusion datasets +#' @name epigames +NULL # "epigames" + +#' \code{diffnet} version of the Epi Games data +#' +#' A directed dynamic graph with 594 vertices and 15 time periods. The attributes +#' in the graph are described in \code{\link{epigames}}. +#' +#' Non-adopters have \code{toa = NA}. +#' +#' @format A \code{\link{diffnet}} class object. +#' @source WKU Epi Game simulation +#' @family diffusion datasets +#' @name epigamesDiffNet +NULL diff --git a/R/plot_diffnet2.r b/R/plot_diffnet2.r index f5cf8b6a..609afb0c 100644 --- a/R/plot_diffnet2.r +++ b/R/plot_diffnet2.r @@ -9,7 +9,7 @@ #' #' x <- rnorm(100) #' w <- data.frame(as.integer(round_to_seq(x, as_factor = TRUE)),x) -#' plot(w,x) +#' plot(w) #' #' @seealso Used in \code{\link{diffmap}} and \code{\link{plot_diffnet2}} round_to_seq <- function(x, nlevels=20, as_factor=FALSE) { diff --git a/README.md b/README.md index 6db64b07..bde38474 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ And the actual R package: Vega Yon G, Olivera Morales A, Valente T (2025). _netdiffuseR: Analysis of Diffusion and Contagion Processes on Networks_. doi:10.5281/zenodo.1039317 , - R package version 1.24.0, . + R package version 1.24.1, . To see these entries in BibTeX format, use 'print(, bibtex=TRUE)', 'toBibtex(.)', or set @@ -374,7 +374,7 @@ sessionInfo() #> [1] stats graphics grDevices utils datasets methods base #> #> other attached packages: -#> [1] netdiffuseR_1.24.0 +#> [1] netdiffuseR_1.24.1 #> #> loaded via a namespace (and not attached): #> [1] Matrix_1.7-4 jsonlite_2.0.0 dplyr_1.1.4 diff --git a/data-raw/epigames.R b/data-raw/epigames.R new file mode 100644 index 00000000..48c54020 --- /dev/null +++ b/data-raw/epigames.R @@ -0,0 +1,13 @@ +# data-raw/epigames.R +# Pre-processing script for the EpiGames Raw Dataset + +rm(list = ls()) + +# The raw data consists of an attributes data frame and an hourly edgelist, +# both using consistent node IDs (1-594). +load("data-raw/epigames_hourly.rda") + +epigames <- epigames_hourly + +# Save compressed raw data +usethis::use_data(epigames, overwrite = TRUE, compress = "xz") diff --git a/data-raw/epigamesDiffNet.R b/data-raw/epigamesDiffNet.R new file mode 100644 index 00000000..a7d313fc --- /dev/null +++ b/data-raw/epigamesDiffNet.R @@ -0,0 +1,50 @@ +# data-raw/epigamesDiffNet.R +# Generating the dynamic diffnet object using netdiffuseR + collapse_timeframes() + +rm(list = ls()) +library(netdiffuseR) + +# Load the base raw dataset created in data-raw/epigames.R (hourly resolution) +load("data/epigames.rda") + +attrs <- epigames$attributes +edges <- epigames$edgelist + +# Collapse hourly edgelist (hours 0-338) into daily windows (days 1-15) +source("R/collapse_timeframes.R") + +daily_edgelist <- collapse_timeframes( + edgelist = edges, + ego = "sender", + alter = "receiver", + timevar = "time", + weightvar = "weight", + window_size = 24, + binarize = TRUE, + cumulative = TRUE, + symmetric = TRUE +) + +# Build daily adjacency matrices +adjmat <- edgelist_to_adjmat( + daily_edgelist[, c("sender", "receiver")], + w = daily_edgelist$weight, + t0 = daily_edgelist$time, + keep.isolates = TRUE, + multiple = TRUE +) + +max_t <- max(daily_edgelist$time, na.rm = TRUE) + +# Prepare TOA vector: real adoption times from attrs, NA for non-adopters +toa_vec <- stats::setNames(attrs$toa, as.character(attrs$id)) + +epigamesDiffNet <- as_diffnet( + adjmat, + toa = toa_vec, + vertex.static.attrs = attrs, + t0 = 1, + t1 = max_t +) + +usethis::use_data(epigamesDiffNet, overwrite = TRUE, compress = "xz") diff --git a/data-raw/epigames_hourly.rda b/data-raw/epigames_hourly.rda new file mode 100644 index 00000000..8f58de65 Binary files /dev/null and b/data-raw/epigames_hourly.rda differ diff --git a/data/epigames.rda b/data/epigames.rda new file mode 100644 index 00000000..0c0dda50 Binary files /dev/null and b/data/epigames.rda differ diff --git a/data/epigamesDiffNet.rda b/data/epigamesDiffNet.rda new file mode 100644 index 00000000..fc1a2deb Binary files /dev/null and b/data/epigamesDiffNet.rda differ diff --git a/man/brfarmers.Rd b/man/brfarmers.Rd index eec40819..e4e46ac6 100644 --- a/man/brfarmers.Rd +++ b/man/brfarmers.Rd @@ -193,6 +193,8 @@ Cresskill N.J.: Hampton Press. Other diffusion datasets: \code{\link{brfarmersDiffNet}}, \code{\link{diffusion-data}}, +\code{\link{epigames}}, +\code{\link{epigamesDiffNet}}, \code{\link{fakeDynEdgelist}}, \code{\link{fakeEdgelist}}, \code{\link{fakesurvey}}, diff --git a/man/brfarmersDiffNet.Rd b/man/brfarmersDiffNet.Rd index 51b4e7e1..f0c3439e 100644 --- a/man/brfarmersDiffNet.Rd +++ b/man/brfarmersDiffNet.Rd @@ -14,6 +14,8 @@ in the graph are static and described in \code{\link{brfarmers}}. Other diffusion datasets: \code{\link{brfarmers}}, \code{\link{diffusion-data}}, +\code{\link{epigames}}, +\code{\link{epigamesDiffNet}}, \code{\link{fakeDynEdgelist}}, \code{\link{fakeEdgelist}}, \code{\link{fakesurvey}}, diff --git a/man/collapse_timeframes.Rd b/man/collapse_timeframes.Rd new file mode 100644 index 00000000..964fd264 --- /dev/null +++ b/man/collapse_timeframes.Rd @@ -0,0 +1,75 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/collapse_timeframes.R +\name{collapse_timeframes} +\alias{collapse_timeframes} +\title{Collapse Timeframes in a Longitudinal Edgelist} +\usage{ +collapse_timeframes( + edgelist, + ego = "sender", + alter = "receiver", + timevar = "time", + weightvar = NULL, + window_size = 1, + time_format = NULL, + relative_time = TRUE, + binarize = FALSE, + cumulative = FALSE, + symmetric = FALSE +) +} +\arguments{ +\item{edgelist}{A \code{data.frame} representing the longitudinal edgelist.} + +\item{ego}{Character scalar. Name of the column representing the ego (sender).} + +\item{alter}{Character scalar. Name of the column representing the alter (receiver).} + +\item{timevar}{Character scalar. Name of the column representing the time variable.} + +\item{weightvar}{Character scalar or \code{NULL}. Name of the column representing +the edge weight. If \code{NULL}, the function tallies the number of interactions +within the time window as the weight.} + +\item{window_size}{Numeric scalar. The size of the time window to collapse into.} + +\item{time_format}{Character scalar or \code{NULL}. If the time variable is a +character or factor, the format passed to \code{as.POSIXct}. +For example, \code{"\%d-\%m-\%Y \%H:\%M"}.} + +\item{relative_time}{Logical scalar. If \code{TRUE}, normalizes the binned +times into a strict integer sequence starting at 1 (1, 2, 3...).} + +\item{binarize}{Logical scalar. If \code{TRUE}, sets all resulting edge weights to 1.} + +\item{cumulative}{Logical scalar. If \code{TRUE}, edges from previous time windows +are carried over to subsequent windows.} + +\item{symmetric}{Logical scalar. If \code{TRUE}, the resulting graph will be +symmetrized (i.e., if an edge A->B exists, an edge B->A is added).} +} +\value{ +A \code{data.frame} with 4 columns: the ego, the alter, the new collapsed + discrete time, and the aggregated weight. +} +\description{ +Allows users to take a high-resolution or continuous-time longitudinal +edgelist and dynamically collapse or discretize it into larger time windows. +The output is a shorter, aggregated edgelist ready to be passed into +\code{[edgelist_to_adjmat]} or \code{[as_diffnet]}. +} +\examples{ +\dontrun{ +# Load the package's hourly dataset +load(system.file("data/epigames_raw.rda", package = "netdiffuseR")) + +# Collapse the hourly edgelist into a daily edgelist (window_size = 24) +daily_edgelist <- collapse_timeframes( + edgelist = epigames_raw$edgelist, + timevar = "time", + weightvar = "weight", + window_size = 24 +) +head(daily_edgelist) +} +} diff --git a/man/diffusion-data.Rd b/man/diffusion-data.Rd index 10517b99..6d2426d5 100644 --- a/man/diffusion-data.Rd +++ b/man/diffusion-data.Rd @@ -15,23 +15,23 @@ medical innovation data originally collected by Coleman, Katz & Menzel (1966); the Brazilian Farmers collected as part of the three country study implemented by Everett Rogers (Rogers, Ascroft, & Röling, 1970), and Korean Family Planning data collected by researchers at the Seoul National University's School of -Public (Rogers & Kincaid, 1981). The table below summarizes the three datasets: +Public (Rogers & Kincaid, 1981). The table below summarizes the datasets: -\tabular{lccc}{ - \tab \bold{Medical Innovation} \tab \bold{Brazilian Farmers} \tab \bold{Korean Family Planning} \cr -\emph{Country} \tab USA \tab Brazil \tab Korean \cr -\emph{# Respondents} \tab 125 Doctors \tab 692 Farmers \tab 1,047 Women \cr -\emph{# Communities} \tab 4 \tab 11 \tab 25 \cr -\emph{Innovation} \tab Tetracycline \tab Hybrid Corn Seed \tab Family Planning \cr -\emph{Time for Diffusion} \tab 18 Months \tab 20 Years \tab 11 Years \cr -\emph{Year Data Collected} \tab 1955-1956 \tab 1966 \tab 1973 \cr -\emph{Ave. Time to 50\%} \tab 6 \tab 16 \tab 7 \cr -\emph{Highest Saturation} \tab 0.89 \tab 0.98 \tab 0.83 \cr -\emph{Lowest Saturation} \tab 0.81 \tab 0.29 \tab 0.44 \cr -\emph{Citation} \tab Coleman et al (1966) \tab Rogers et al (1970) \tab Rogers & Kincaid (1981) \cr +\tabular{lcccc}{ + \tab \bold{Medical Innovation} \tab \bold{Brazilian Farmers} \tab \bold{Korean Family Planning} \tab \bold{WKU Epi Games} \cr +\emph{Country} \tab USA \tab Brazil \tab Korean \tab USA \cr +\emph{# Respondents} \tab 125 Doctors \tab 692 Farmers \tab 1,047 Women \tab 594 Students \cr +\emph{# Communities} \tab 4 \tab 11 \tab 25 \tab Multiple groups \cr +\emph{Innovation} \tab Tetracycline \tab Hybrid Corn Seed \tab Family Planning \tab Masks/Medicine \cr +\emph{Time for Diffusion} \tab 18 Months \tab 20 Years \tab 11 Years \tab 15 Periods \cr +\emph{Year Data Collected} \tab 1955-1956 \tab 1966 \tab 1973 \tab Recent \cr +\emph{Ave. Time to 50\%} \tab 6 \tab 16 \tab 7 \tab N/A \cr +\emph{Highest Saturation} \tab 0.89 \tab 0.98 \tab 0.83 \tab N/A \cr +\emph{Lowest Saturation} \tab 0.81 \tab 0.29 \tab 0.44 \tab N/A \cr +\emph{Citation} \tab Coleman et al (1966) \tab Rogers et al (1970) \tab Rogers & Kincaid (1981) \tab WKU \cr } -All datasets include a column called \emph{study} which is coded as +All core datasets include a column called \emph{study} which is coded as (1) Medical Innovation (2) Brazilian Farmers, (3) Korean Family Planning. } \section{Right censored data}{ @@ -87,6 +87,8 @@ Analysis in the Social Sciences (pp. 98–116). New York: Cambridge University P Other diffusion datasets: \code{\link{brfarmers}}, \code{\link{brfarmersDiffNet}}, +\code{\link{epigames}}, +\code{\link{epigamesDiffNet}}, \code{\link{fakeDynEdgelist}}, \code{\link{fakeEdgelist}}, \code{\link{fakesurvey}}, diff --git a/man/epigames.Rd b/man/epigames.Rd new file mode 100644 index 00000000..dd0f22af --- /dev/null +++ b/man/epigames.Rd @@ -0,0 +1,53 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.r +\name{epigames} +\alias{epigames} +\title{Epi Games Dataset} +\format{ +A list with two data frames: + +**attributes**: A data frame with 594 rows and 9 variables representing nodes: +\describe{ + \item{id}{Unique identifier for the participant.} + \item{toa}{Time of Adoption (1 to 15), representing when the individual was first infected. Non-infected individuals have `NA`.} + \item{qyes_total}{Cumulative count of times the player participated or scored positively in informative/educational "quarantine" questionnaires.} + \item{qno_total}{Cumulative count of times the non-quarantine questionnaire factor was registered.} + \item{mask_prop}{Proportion of time (across 15 steps) the participant used the mask intervention (0.0 to 1.0).} + \item{med_prop}{Proportion of time the individual used pharmacological interventions or treatments.} + \item{group}{Experimental group or node cohort.} + \item{final_score}{Final score obtained in the game.} + \item{status}{Final state label ("infected" or "not_infected").} +} + +**edgelist**: A longitudinal data frame with 23,684 rows and 4 variables representing edges/contacts: +\describe{ + \item{sender}{Origin node ID of the contact.} + \item{receiver}{Destination node ID of the contact.} + \item{time}{Time period of the contact (1 to 15).} + \item{weight}{Strength, duration, or density of the exposure.} +} +} +\source{ +WKU Epi Game simulation +} +\description{ +The WKU Epi Games dataset represents a simulated epidemic or game environment with +dynamic encounters over 15 time periods. It provides both node-level +attributes and a longitudinal edgelist. +} +\seealso{ +Other diffusion datasets: +\code{\link{brfarmers}}, +\code{\link{brfarmersDiffNet}}, +\code{\link{diffusion-data}}, +\code{\link{epigamesDiffNet}}, +\code{\link{fakeDynEdgelist}}, +\code{\link{fakeEdgelist}}, +\code{\link{fakesurvey}}, +\code{\link{fakesurveyDyn}}, +\code{\link{kfamily}}, +\code{\link{kfamilyDiffNet}}, +\code{\link{medInnovations}}, +\code{\link{medInnovationsDiffNet}} +} +\concept{diffusion datasets} diff --git a/man/epigamesDiffNet.Rd b/man/epigamesDiffNet.Rd new file mode 100644 index 00000000..f6338351 --- /dev/null +++ b/man/epigamesDiffNet.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.r +\name{epigamesDiffNet} +\alias{epigamesDiffNet} +\title{\code{diffnet} version of the Epi Games data} +\format{ +A \code{\link{diffnet}} class object. +} +\source{ +WKU Epi Game simulation +} +\description{ +A directed dynamic graph with 594 vertices and 15 time periods. The attributes +in the graph are described in \code{\link{epigames}}. +} +\details{ +Non-adopters have \code{toa = NA}. +} +\seealso{ +Other diffusion datasets: +\code{\link{brfarmers}}, +\code{\link{brfarmersDiffNet}}, +\code{\link{diffusion-data}}, +\code{\link{epigames}}, +\code{\link{fakeDynEdgelist}}, +\code{\link{fakeEdgelist}}, +\code{\link{fakesurvey}}, +\code{\link{fakesurveyDyn}}, +\code{\link{kfamily}}, +\code{\link{kfamilyDiffNet}}, +\code{\link{medInnovations}}, +\code{\link{medInnovationsDiffNet}} +} +\concept{diffusion datasets} diff --git a/man/fakeDynEdgelist.Rd b/man/fakeDynEdgelist.Rd index abede09f..5740d5ae 100644 --- a/man/fakeDynEdgelist.Rd +++ b/man/fakeDynEdgelist.Rd @@ -24,6 +24,8 @@ Other diffusion datasets: \code{\link{brfarmers}}, \code{\link{brfarmersDiffNet}}, \code{\link{diffusion-data}}, +\code{\link{epigames}}, +\code{\link{epigamesDiffNet}}, \code{\link{fakeEdgelist}}, \code{\link{fakesurvey}}, \code{\link{fakesurveyDyn}}, diff --git a/man/fakeEdgelist.Rd b/man/fakeEdgelist.Rd index 68578eb8..e620a609 100644 --- a/man/fakeEdgelist.Rd +++ b/man/fakeEdgelist.Rd @@ -23,6 +23,8 @@ Other diffusion datasets: \code{\link{brfarmers}}, \code{\link{brfarmersDiffNet}}, \code{\link{diffusion-data}}, +\code{\link{epigames}}, +\code{\link{epigamesDiffNet}}, \code{\link{fakeDynEdgelist}}, \code{\link{fakesurvey}}, \code{\link{fakesurveyDyn}}, diff --git a/man/fakesurvey.Rd b/man/fakesurvey.Rd index b3e8c2c7..edffef14 100644 --- a/man/fakesurvey.Rd +++ b/man/fakesurvey.Rd @@ -30,6 +30,8 @@ Other diffusion datasets: \code{\link{brfarmers}}, \code{\link{brfarmersDiffNet}}, \code{\link{diffusion-data}}, +\code{\link{epigames}}, +\code{\link{epigamesDiffNet}}, \code{\link{fakeDynEdgelist}}, \code{\link{fakeEdgelist}}, \code{\link{fakesurveyDyn}}, diff --git a/man/fakesurveyDyn.Rd b/man/fakesurveyDyn.Rd index 90e4cfca..7f6b6ca9 100644 --- a/man/fakesurveyDyn.Rd +++ b/man/fakesurveyDyn.Rd @@ -31,6 +31,8 @@ Other diffusion datasets: \code{\link{brfarmers}}, \code{\link{brfarmersDiffNet}}, \code{\link{diffusion-data}}, +\code{\link{epigames}}, +\code{\link{epigamesDiffNet}}, \code{\link{fakeDynEdgelist}}, \code{\link{fakeEdgelist}}, \code{\link{fakesurvey}}, diff --git a/man/kfamily.Rd b/man/kfamily.Rd index 95337388..b1cf4499 100644 --- a/man/kfamily.Rd +++ b/man/kfamily.Rd @@ -472,6 +472,8 @@ Other diffusion datasets: \code{\link{brfarmers}}, \code{\link{brfarmersDiffNet}}, \code{\link{diffusion-data}}, +\code{\link{epigames}}, +\code{\link{epigamesDiffNet}}, \code{\link{fakeDynEdgelist}}, \code{\link{fakeEdgelist}}, \code{\link{fakesurvey}}, diff --git a/man/kfamilyDiffNet.Rd b/man/kfamilyDiffNet.Rd index da181df5..d7bf5b3c 100644 --- a/man/kfamilyDiffNet.Rd +++ b/man/kfamilyDiffNet.Rd @@ -15,6 +15,8 @@ Other diffusion datasets: \code{\link{brfarmers}}, \code{\link{brfarmersDiffNet}}, \code{\link{diffusion-data}}, +\code{\link{epigames}}, +\code{\link{epigamesDiffNet}}, \code{\link{fakeDynEdgelist}}, \code{\link{fakeEdgelist}}, \code{\link{fakesurvey}}, diff --git a/man/medInnovations.Rd b/man/medInnovations.Rd index 1620fe57..40f5dd1c 100644 --- a/man/medInnovations.Rd +++ b/man/medInnovations.Rd @@ -98,6 +98,8 @@ Other diffusion datasets: \code{\link{brfarmers}}, \code{\link{brfarmersDiffNet}}, \code{\link{diffusion-data}}, +\code{\link{epigames}}, +\code{\link{epigamesDiffNet}}, \code{\link{fakeDynEdgelist}}, \code{\link{fakeEdgelist}}, \code{\link{fakesurvey}}, diff --git a/man/medInnovationsDiffNet.Rd b/man/medInnovationsDiffNet.Rd index a636a22b..a56068b5 100644 --- a/man/medInnovationsDiffNet.Rd +++ b/man/medInnovationsDiffNet.Rd @@ -15,6 +15,8 @@ Other diffusion datasets: \code{\link{brfarmers}}, \code{\link{brfarmersDiffNet}}, \code{\link{diffusion-data}}, +\code{\link{epigames}}, +\code{\link{epigamesDiffNet}}, \code{\link{fakeDynEdgelist}}, \code{\link{fakeEdgelist}}, \code{\link{fakesurvey}}, diff --git a/man/round_to_seq.Rd b/man/round_to_seq.Rd index 834e7c97..450367a2 100644 --- a/man/round_to_seq.Rd +++ b/man/round_to_seq.Rd @@ -24,7 +24,7 @@ Takes a numeric vector and maps it into a finite length sequence x <- rnorm(100) w <- data.frame(as.integer(round_to_seq(x, as_factor = TRUE)),x) -plot(w,x) +plot(w) } \seealso{ diff --git a/tests/testthat/test-collapse_timeframes.R b/tests/testthat/test-collapse_timeframes.R new file mode 100644 index 00000000..657806c3 --- /dev/null +++ b/tests/testthat/test-collapse_timeframes.R @@ -0,0 +1,246 @@ +context("collapse_timeframes: collapsing longitudinal edgelists") + +# Base edgelist used across most tests: +# - 2 directed pairs: (1->2) and (2->3) +# - 4 time points: 1, 2, 3, 4 +# - Each pair appears twice per time point +el <- data.frame( + sender = c(1, 1, 1, 1, 2, 2, 2, 2), + receiver = c(2, 2, 2, 2, 3, 3, 3, 3), + time = c(1, 2, 3, 4, 1, 2, 3, 4), + weight = c(1, 1, 1, 1, 1, 1, 1, 1) +) + +# Block 1: Output structure ----------------------------------------------- + +test_that("collapse_timeframes returns a data.frame", { + result <- collapse_timeframes(el, ego = "sender", alter = "receiver", + timevar = "time", weightvar = "weight", + window_size = 1) + expect_s3_class(result, "data.frame") +}) + +test_that("collapse_timeframes returns exactly 4 columns", { + result <- collapse_timeframes(el, ego = "sender", alter = "receiver", + timevar = "time", weightvar = "weight", + window_size = 1) + expect_equal(ncol(result), 4L) +}) + +test_that("output column names match inputs", { + result <- collapse_timeframes(el, ego = "sender", alter = "receiver", + timevar = "time", weightvar = "weight", + window_size = 1) + expect_named(result, c("sender", "receiver", "time", "weight")) +}) + +test_that("output has fewer or equal rows than input after collapsing", { + result <- collapse_timeframes(el, ego = "sender", alter = "receiver", + timevar = "time", weightvar = "weight", + window_size = 2) + expect_lte(nrow(result), nrow(el)) +}) + +# Block 2: Binning logic (window_size) ------------------------------------ + +test_that("window_size=1 does not merge periods", { + result <- collapse_timeframes(el, ego = "sender", alter = "receiver", + timevar = "time", weightvar = "weight", + window_size = 1) + expect_equal(length(unique(result$time)), 4L) +}) + +test_that("window_size=2 merges 4 periods into 2 bins", { + result <- collapse_timeframes(el, ego = "sender", alter = "receiver", + timevar = "time", weightvar = "weight", + window_size = 2) + expect_equal(length(unique(result$time)), 2L) +}) + +test_that("window_size=4 merges all periods into 1 bin", { + result <- collapse_timeframes(el, ego = "sender", alter = "receiver", + timevar = "time", weightvar = "weight", + window_size = 4) + expect_equal(length(unique(result$time)), 1L) +}) + +test_that("aggregated weight is sum of constituent weights", { + # Two rows with weight=0.5 in bin 1 should aggregate to 1.0 + el2 <- data.frame( + sender = c(1, 1), + receiver = c(2, 2), + time = c(1, 2), + weight = c(0.5, 0.5) + ) + result <- collapse_timeframes(el2, ego = "sender", alter = "receiver", + timevar = "time", weightvar = "weight", + window_size = 2) + expect_equal(result$weight, 1.0) +}) + +# Block 3: relative_time TRUE / FALSE ------------------------------------- + +# Edgelist with a gap: time points 1, 2, 5, 6 (no 3 or 4) +el_gap <- data.frame( + sender = c(1, 1, 1, 1), + receiver = c(2, 2, 2, 2), + time = c(1, 2, 5, 6), + weight = c(1, 1, 1, 1) +) + +test_that("relative_time=TRUE produces a strict 1,2,... sequence", { + result <- collapse_timeframes(el_gap, ego = "sender", alter = "receiver", + timevar = "time", weightvar = "weight", + window_size = 1, relative_time = TRUE) + expect_equal(sort(unique(result$time)), 1:4) +}) + +test_that("relative_time=FALSE preserves original bin values (may have gaps)", { + result <- collapse_timeframes(el_gap, ego = "sender", alter = "receiver", + timevar = "time", weightvar = "weight", + window_size = 1, relative_time = FALSE) + expect_false(identical(sort(unique(result$time)), 1:4)) +}) + +# Block 4: Time column parsing (integer, POSIXct, character string) ------- + +test_that("integer time column is handled", { + el_int <- el + el_int$time <- as.integer(el_int$time) + result <- collapse_timeframes(el_int, ego = "sender", alter = "receiver", + timevar = "time", weightvar = "weight", + window_size = 2) + expect_equal(length(unique(result$time)), 2L) +}) + +test_that("POSIXct time column is handled", { + origin <- as.POSIXct("2024-01-01 00:00:00", tz = "UTC") + el_posix <- el + el_posix$time <- origin + (el$time - 1) * 3600 # 1 hour apart + result <- collapse_timeframes(el_posix, ego = "sender", alter = "receiver", + timevar = "time", weightvar = "weight", + window_size = 7200) # 2-hour windows (in seconds) + expect_equal(length(unique(result$time)), 2L) +}) + +test_that("character time with time_format is parsed correctly", { + el_chr <- el + el_chr$time <- format( + as.POSIXct("2024-01-01", tz = "UTC") + (el$time - 1) * 86400, + "%Y-%m-%d" + ) + result <- collapse_timeframes(el_chr, ego = "sender", alter = "receiver", + timevar = "time", weightvar = "weight", + window_size = 2 * 86400, # 2-day windows + time_format = "%Y-%m-%d") + expect_equal(length(unique(result$time)), 2L) +}) + +# Block 5: weightvar = NULL (count mode) vs explicit weight column -------- + +test_that("weightvar=NULL counts interactions as weight", { + el_now <- data.frame( + sender = c(1, 1, 1), + receiver = c(2, 2, 2), + time = c(1, 1, 1) + ) + result <- collapse_timeframes(el_now, ego = "sender", alter = "receiver", + timevar = "time", weightvar = NULL, + window_size = 1) + # 3 interactions in 1 bin -> weight should be 3 + expect_equal(result$weight, 3) +}) + +test_that("weightvar=NULL output column is named 'weight'", { + result <- collapse_timeframes(el[, c("sender","receiver","time")], + ego = "sender", alter = "receiver", + timevar = "time", weightvar = NULL, + window_size = 1) + expect_true("weight" %in% names(result)) +}) + +test_that("explicit weight column is summed correctly", { + el_w <- data.frame( + sender = c(1, 1), + receiver = c(2, 2), + time = c(1, 1), + w = c(3, 7) + ) + result <- collapse_timeframes(el_w, ego = "sender", alter = "receiver", + timevar = "time", weightvar = "w", + window_size = 1) + expect_equal(result$w, 10) +}) + +# Block 6: Edge cases and error handling ---------------------------------- + +test_that("NAs in time column produce a warning", { + el_na <- el + el_na$time[1] <- NA + expect_warning( + collapse_timeframes(el_na, ego = "sender", alter = "receiver", + timevar = "time", weightvar = "weight", + window_size = 1), + "NA" + ) +}) + +test_that("minimal input (1 pair, 1 period) works", { + el_min <- data.frame(sender = 1, receiver = 2, time = 1, weight = 1) + result <- collapse_timeframes(el_min, ego = "sender", alter = "receiver", + timevar = "time", weightvar = "weight", + window_size = 1) + expect_equal(nrow(result), 1L) + expect_equal(result$time, 1L) +}) + +test_that("custom ego/alter column names are respected in output", { + el_custom <- el + names(el_custom) <- c("from", "to", "period", "weight") + result <- collapse_timeframes(el_custom, ego = "from", alter = "to", + timevar = "period", weightvar = "weight", + window_size = 2) + expect_named(result, c("from", "to", "period", "weight")) +}) + +test_that("output time starts at 1", { + result <- collapse_timeframes(el, ego = "sender", alter = "receiver", + timevar = "time", weightvar = "weight", + window_size = 1) + expect_equal(min(result$time), 1L) +}) + +# Block 7: Post-aggregation processing (binarize, cumulative, symmetric) -- + +test_that("binarize=TRUE sets all weights to 1", { + result <- collapse_timeframes(el, ego = "sender", alter = "receiver", + timevar = "time", weightvar = "weight", + window_size = 2, binarize = TRUE) + expect_true(all(result$weight == 1)) +}) + +test_that("symmetric=TRUE adds reverse edges", { + # Asymmetric input: 1->2 only + el_asym <- data.frame(sender = 1, receiver = 2, time = 1, weight = 1) + result <- collapse_timeframes(el_asym, ego = "sender", alter = "receiver", + timevar = "time", weightvar = "weight", + window_size = 1, symmetric = TRUE) + expect_equal(nrow(result), 2L) + # Check if reverse is present + expect_true(any(result$sender == 2 & result$receiver == 1)) +}) + +test_that("cumulative=TRUE carries edges forward", { + el_cum <- data.frame( + sender = c(1, 2), + receiver = c(2, 3), + time = c(1, 2), + weight = c(1, 1) + ) + result <- collapse_timeframes(el_cum, ego = "sender", alter = "receiver", + timevar = "time", weightvar = "weight", + window_size = 1, cumulative = TRUE) + # time=1 has 1->2, time=2 should have both 1->2 and 2->3 + expect_equal(nrow(result[result$time == 1, ]), 1L) + expect_equal(nrow(result[result$time == 2, ]), 2L) +})