diff --git a/R/LD.R b/R/LD.R index 8a7613a0..014c7cda 100644 --- a/R/LD.R +++ b/R/LD.R @@ -1,71 +1,107 @@ -#' Deduplicate and sort genomic regions by chromosome and start position. -#' @importFrom dplyr distinct arrange +#' Function to Check if Regions are in increasing order and remove duplicated rows +#' @importFrom dplyr distinct arrange group_by mutate ungroup #' @importFrom magrittr %>% #' @noRd order_dedup_regions <- function(df) { - df$chrom <- as.integer(strip_chr_prefix(df$chrom)) + # Ensure that 'chrom' values are integers, df can be genomic_data or regions_of_interest + df$chrom <- ifelse(grepl("^chr", df$chrom), + as.integer(sub("^chr", "", df$chrom)), # Remove 'chr' and convert to integer + as.integer(df$chrom) + ) # Convert to integer if not already + + # Remove duplicated rows based on 'chrom' and 'start' columns df <- distinct(df, chrom, start, .keep_all = TRUE) %>% arrange(chrom, start) - df + + # for (chr in unique(df$chrom)) { + # chr_rows <- which(df$chrom == chr) + # if (length(chr_rows) > 1) { + # starts <- df$start[chr_rows] + # for (i in 2:length(starts)) { + # if (starts[i] < starts[i - 1]) { + # stop("The input list of regions is not in increasing order within each chromosome.") + # } + # } + # } + # } + + return(df) } -#' Find the first and last rows of genomic_data that overlap a query region. -#' Clamps the query to the available data range before searching. 
+#' Function to Find Start and End Rows of Genomic Data for Region of Interest #' @importFrom dplyr filter arrange slice #' @noRd find_intersection_rows <- function(genomic_data, region_chrom, region_start, region_end) { + # Filter for the specific chromosome chrom_data <- genomic_data %>% filter(chrom == region_chrom) - if (nrow(chrom_data) == 0) stop("No data for chromosome ", region_chrom) - # Clamp query to available range - region_start <- max(region_start, min(chrom_data$start)) - region_end <- min(region_end, max(chrom_data$end)) + min_start <- if (nrow(chrom_data) > 0) min(chrom_data$start) else NA + max_end <- if (nrow(chrom_data) > 0) max(chrom_data$end) else NA + + # Adjust region bounds if they're outside available data range + if (!is.na(min_start) && region_start < min_start) { + region_start <- min_start + } + + if (!is.na(max_end) && region_end > max_end) { + region_end <- max_end + } + # Try to find rows that cover the region start and end start_row <- genomic_data %>% filter(chrom == region_chrom, start <= region_start, end > region_start) %>% slice(1) + end_row <- genomic_data %>% filter(chrom == region_chrom, start < region_end, end >= region_end) %>% arrange(desc(end)) %>% slice(1) if (nrow(start_row) == 0 || nrow(end_row) == 0) { - stop("Region ", region_chrom, ":", region_start, "-", region_end, - " is not covered by any rows in the LD metadata.") + stop("Region of interest is not covered by any rows in the data frame.") } + list(start_row = start_row, end_row = end_row) } -#' Validate that start_row..end_row fully covers [region_start, region_end]. 
+#' Function to Validate Selected Region #' @noRd validate_selected_region <- function(start_row, end_row, region_start, region_end) { - if (start_row$start > region_start || end_row$end < region_end) { - stop("Region ", region_start, "-", region_end, " is not fully covered by the LD metadata ", - "(available: ", start_row$start, "-", end_row$end, ").") + if (!(start_row$start <= region_start && end_row$end >= region_end)) { + stop("The selected region is not fully covered by the merged region.") } } -#' Extract values of a column for rows spanning the intersection range. +#' Extract File Paths Based on Intersection Criteria #' @noRd extract_file_paths <- function(genomic_data, intersection_rows, column_to_extract) { + # Ensure the file_path_column exists in genomic_data if (!column_to_extract %in% names(genomic_data)) { - stop("Column '", column_to_extract, "' not found in genomic data.") + stop(paste("Column", column_to_extract, "not found in genomic data")) } - idx <- which(genomic_data$chrom == intersection_rows$start_row$chrom & - genomic_data$start >= intersection_rows$start_row$start & - genomic_data$start <= intersection_rows$end_row$start) - genomic_data[[column_to_extract]][idx] + # Extract rows based on intersection criteria + extracted_paths <- genomic_data[[column_to_extract]][which( + genomic_data$chrom == intersection_rows$start_row$chrom & + genomic_data$start >= intersection_rows$start_row$start & + genomic_data$start <= intersection_rows$end_row$start + )] + + # Return the extracted paths + return(extracted_paths) } -#' Find LD blocks overlapping a query region from a metadata TSV file. +#' Intersect LD reference with Regions of Interest +#' +#' @param ld_reference_meta_file A file of data frame with columns "chrom", "start", "end", and "path" representing genomic regions. +#' "chrom" is the chromosome, "start" and "end" are the positions of the LD block, and "path" is the file path for the LD block. 
+#' @param region A data frame with columns "chrom", "start", and "end" specifying regions of interest. +#' "start" and "end" are the positions of these regions. Or it can take the form of `chr:start-end` #' -#' @param ld_reference_meta_file TSV with columns chrom, start, end, path. -#' The path column may be comma-separated: "ld_file,bim_file". -#' @param region "chr:start-end" string or data.frame with chrom/start/end. -#' @param complete_coverage_required If TRUE, error when the region extends -#' beyond available LD blocks. -#' @return A list with: intersections (LD_file_paths, bim_file_paths), -#' ld_meta_data, and parsed region. +#' @return A list containing processed genomic data, region of interest, and a detailed result list. +#' The result list contains, for each region: +#' - The start and end row indices in the genomic data +#' - File paths from the genomic data corresponding to intersected regions +#' - Optionally, bim file paths if available #' @importFrom stringr str_split #' @importFrom dplyr select #' @importFrom vroom vroom @@ -117,298 +153,183 @@ get_regional_ld_meta <- function(ld_reference_meta_file, region, complete_covera )) } -#' Read a pre-computed LD matrix (.cor.xz) and its bim file, returning a -#' symmetric matrix with variants ordered by position. 
#' @importFrom dplyr mutate #' @importFrom utils read.table #' @importFrom stats setNames -#' @noRd +# Process an LD matrix from a file path process_LD_matrix <- function(LD_file_path, bim_file_path) { - # Read .cor.xz matrix + # Order the variants by position + order_variants_by_position <- function(strings) { + # Apply the function to each variant to get a vector of positions + positions <- sapply(strings, function(variant) as.integer(strsplit(variant, ":")[[1]][2])) + # Check whether the merged variants is orderd + # when diff() returns 0, it is multiallelic at the position (same position but >1 variations) + if (!all(diff(positions[order(positions)]) >= 0)) { + stop("The positions are not in non-decreasing order") + } + # Order the variants by position + strings_ordered <- strings[order(positions)] + return(strings_ordered) + } + # Read the LD matrix LD_file_con <- xzfile(LD_file_path) LD_matrix <- scan(LD_file_con, quiet = TRUE) close(LD_file_con) LD_matrix <- matrix(LD_matrix, ncol = sqrt(length(LD_matrix)), byrow = TRUE) - # Read bim (variant metadata) - bim_file_name <- if (!is.null(bim_file_path)) bim_file_path else paste0(LD_file_path, ".bim") + # Process the bim file to extract variant information + bim_file_name <- if (!is.null(bim_file_path)) { + bim_file_path + } else { + paste0(LD_file_path, ".bim", sep = "") + } + + # Process variant names from file paths LD_variants <- read.table(bim_file_name) - col_names <- if (ncol(LD_variants) == 9) { - c("chrom", "variants", "GD", "pos", "A1", "A2", "variance", "allele_freq", "n_nomiss") + if (ncol(LD_variants) == 9) { + LD_variants <- LD_variants %>% + setNames(c("chrom", "variants", "GD", "pos", "A1", "A2", "variance", "allele_freq", "n_nomiss")) %>% + mutate(chrom = as.character(as.integer(sub("^chr", "", chrom)))) %>% + mutate(variants = normalize_variant_id(variants)) } else if (ncol(LD_variants) == 6) { - c("chrom", "variants", "GD", "pos", "A1", "A2") + LD_variants <- LD_variants %>% + 
setNames(c("chrom", "variants", "GD", "pos", "A1", "A2")) %>% + mutate(chrom = as.character(as.integer(sub("^chr", "", chrom)))) %>% + mutate(variants = normalize_variant_id(variants)) } else { - stop("Unexpected number of columns (", ncol(LD_variants), ") in bim file: ", bim_file_name) + stop("Unexpected number of columns in the input file.") } - LD_variants <- LD_variants %>% - setNames(col_names) %>% - mutate(chrom = as.character(as.integer(strip_chr_prefix(chrom))), - variants = normalize_variant_id(variants)) - # Label and symmetrize the matrix + + # Set column and row names of the LD matrix colnames(LD_matrix) <- rownames(LD_matrix) <- LD_variants$variants - if (all(LD_matrix[lower.tri(LD_matrix)] == 0)) { + # Check if the matrix is upper diagonal + # We assume a matrix is upper diagonal if all elements below the main diagonal are zero + is_upper_diagonal <- all(LD_matrix[lower.tri(LD_matrix)] == 0) + if (is_upper_diagonal) { + # If the matrix is upper diagonal, transpose the upper triangle to the lower triangle LD_matrix[lower.tri(LD_matrix)] <- t(LD_matrix)[lower.tri(LD_matrix)] } else { + # If the matrix is lower diagonal, transpose the lower triangle to the upper triangle LD_matrix[upper.tri(LD_matrix)] <- t(LD_matrix)[upper.tri(LD_matrix)] } - - # Order variants by genomic position - pos_order <- order(sapply(LD_variants$variants, function(v) as.integer(strsplit(v, ":")[[1]][2]))) - LD_variants <- LD_variants[pos_order, ] - LD_matrix <- LD_matrix[LD_variants$variants, LD_variants$variants] - - list(LD_matrix = LD_matrix, LD_variants = LD_variants) + LD_variants_ordered <- LD_variants[match(order_variants_by_position(LD_variants$variants), LD_variants$variants), ] + LD_matrix <- LD_matrix[match(LD_variants_ordered$variants, rownames(LD_matrix)), match(LD_variants_ordered$variants, rownames(LD_matrix))] + list(LD_matrix = LD_matrix, LD_variants = LD_variants_ordered) } -#' Subset an LD matrix and variant info to a genomic region, optionally -#' further 
restricted to specific coordinates. +#' Extract LD matrix and variants for a specific region #' @importFrom dplyr mutate select #' @importFrom magrittr %>% -#' @noRd +#' @importFrom utils tail extract_LD_for_region <- function(LD_matrix, variants, region, extract_coordinates) { - extracted <- subset(variants, chrom == region$chrom & pos >= region$start & pos <= region$end) - + # Filter variants based on region + extracted_LD_variants <- subset(variants, chrom == region$chrom & pos >= region$start & pos <= region$end) if (!is.null(extract_coordinates)) { + # Preprocess 'extract_coordinate' to ensure 'chrom' is numeric and without 'chr' extract_coordinates <- extract_coordinates %>% - mutate(chrom = as.integer(strip_chr_prefix(chrom))) %>% + mutate(chrom = ifelse(grepl("^chr", chrom), as.integer(sub("^chr", "", chrom)), chrom)) %>% select(chrom, pos) - extracted <- extracted %>% - mutate(chrom = as.integer(strip_chr_prefix(chrom))) %>% + # Now merge with 'LD_variants_region_selected' + extracted_LD_variants <- extracted_LD_variants %>% + # Ensure that 'chrom' values in 'LD_variants_region_selected' are numeric, remove 'chr' if present + mutate(chrom = ifelse(grepl("^chr", chrom), as.integer(sub("^chr", "", chrom)), as.integer(chrom))) %>% + # Merge with 'extract_coordinate' after 'chrom' adjustment merge(extract_coordinates, by = c("chrom", "pos")) - keep_cols <- intersect(c("chrom", "variants", "pos", "GD", "A1", "A2", - "variance", "allele_freq", "n_nomiss"), names(extracted)) - extracted <- select(extracted, all_of(keep_cols)) + # Select the desired columns, assuming 'variants' column is equivalent to the 'variants' in 'LD_variants_region_selected' + # Select columns dynamically based on the presence of 'variance' + cols_to_select <- c("chrom", "variants", "pos", "GD", "A1", "A2") # select(chrom, variants, pos, GD, A1, A2) + if ("variance" %in% names(extracted_LD_variants)) { + cols_to_select <- c(cols_to_select, "variance") + } + extracted_LD_variants <- 
select(extracted_LD_variants, all_of(cols_to_select)) } - - mat <- LD_matrix[extracted$variants, extracted$variants, drop = FALSE] - list(extracted_LD_matrix = mat, extracted_LD_variants = extracted) + # Extract LD matrix + extracted_LD_matrix <- LD_matrix[extracted_LD_variants$variants, extracted_LD_variants$variants, drop = FALSE] + list(extracted_LD_matrix = extracted_LD_matrix, extracted_LD_variants = extracted_LD_variants) } -#' Combine multiple block-level LD matrices into one, handling boundary overlaps. -#' @importFrom utils tail -#' @noRd -create_LD_matrix <- function(LD_matrices, variants) { - # Merge variant lists, deduplicating boundary overlaps - merge_variants <- function(variant_list) { - merged <- character(0) - for (v in variant_list) { - ids <- if (is.list(v) && !is.null(v$variants)) v$variants else v - if (length(ids) == 0) next - if (length(merged) > 0 && tail(merged, 1) == ids[1]) ids <- ids[-1] - merged <- c(merged, ids) +# Create a combined LD matrix from multiple matrices +create_combined_LD_matrix <- function(LD_matrices, variants) { + # Updated mergeVariants helper that checks the structure of each element. 
+ mergeVariants <- function(LD_variants_list) { + mergedVariants <- character(0) + # Loop over the list of variant information + for (LD_variants in LD_variants_list) { + currentVariants <- if (is.list(LD_variants) && !is.null(LD_variants$variants)) { + LD_variants$variants + } else { + LD_variants + } + + if (length(currentVariants) == 0) next + + # If the last variant in mergedVariants is the same as the first of currentVariants, skip the duplicate + if (length(mergedVariants) > 0 && tail(mergedVariants, 1) == currentVariants[1]) { + mergedVariants <- c(mergedVariants, currentVariants[-1]) + } else { + mergedVariants <- c(mergedVariants, currentVariants) + } } - merged + return(mergedVariants) } - all_variants <- merge_variants(variants) - combined <- matrix(0, nrow = length(all_variants), ncol = length(all_variants), - dimnames = list(all_variants, all_variants)) + unique_variants <- mergeVariants(variants) + # Initialize an empty combined LD matrix with the unique variants + combined_LD_matrix <- matrix(0, nrow = length(unique_variants), ncol = length(unique_variants)) + rownames(combined_LD_matrix) <- unique_variants + colnames(combined_LD_matrix) <- unique_variants - # Place each block into the combined matrix - for (i in seq_along(LD_matrices)) { - v <- rownames(LD_matrices[[i]]) - idx <- match(v, all_variants) - combined[idx, idx] <- LD_matrices[[i]] + # Function to align the values from each LD matrix to the combined matrix + align_matrix <- function(ld_matrix, combined_matrix, variant_names) { + indices <- match(variant_names, rownames(combined_matrix)) + combined_matrix[indices, indices] <- ld_matrix + return(combined_matrix) } - combined + + # Use Map to pair each LD matrix with its rownames, then Reduce to fill the combined matrix. 
+ combined_LD_matrix <- Reduce( + function(x, y) align_matrix(y[[1]], x, y[[2]]), + Map(list, LD_matrices, lapply(LD_matrices, rownames)), + combined_LD_matrix + ) + + return(combined_LD_matrix) } #' Load and Process Linkage Disequilibrium (LD) Matrix #' -#' Unified entry point for loading LD data. Auto-detects the source type: -#' \itemize{ -#' \item Pre-computed LD blocks (metadata file with chrom/start/end/path columns) -#' \item PLINK2 genotype files (.pgen/.pvar[.zst]/.psam + .afreq) -#' \item PLINK1 genotype files (.bed/.bim/.fam) -#' } -#' For PLINK genotype sources, LD is computed on the fly via \code{compute_LD()}. -#' -#' @param LD_meta_file_path Path to LD metadata file, or prefix for PLINK genotype files. -#' @param region Region of interest: "chr:start-end" string or data.frame with chrom/start/end. -#' @param extract_coordinates Optional data.frame with columns "chrom" and "pos" for -#' specific coordinates extraction (only for pre-computed LD blocks). -#' @param return_genotype If TRUE, return the genotype matrix X instead of the LD -#' correlation matrix R. Only valid for PLINK genotype sources. -#' @param n_sample Optional sample size for computing variance (= 2*p*(1-p)*n/(n-1)). -#' If NULL, ref_panel will not include variance or n_nomiss columns. -#' Only used for PLINK genotype sources. +#' @param LD_meta_file_path path of LD_metadata, LD_metadata is a data frame specifying LD blocks with +#' columns "chrom", "start", "end", and "path". "start" and "end" denote the positions of LD blocks. +#' "path" is the path of each LD block, optionally including bim file paths. +#' @param region A data frame specifying region of interest with columns "chrom", "start", and "end". +#' @param extract_coordinates Optional data frame with columns "chrom" and "pos" for specific coordinates extraction. #' -#' @return A list with: +#' @return A list of processed LD matrices and associated variant data frames for region of interest. 
+#' Each element of the list contains: #' \describe{ -#' \item{LD_variants}{Character vector of variant IDs (canonical format).} -#' \item{LD_matrix}{Symmetric LD correlation matrix (or genotype matrix if return_genotype=TRUE).} -#' \item{ref_panel}{Data.frame with variant metadata (chrom, pos, A2, A1, variant_id, -#' and optionally allele_freq, variance, n_nomiss).} -#' \item{block_metadata}{Data.frame with block info (block_id, chrom, block_start, block_end, size, start_idx, end_idx).} +#' \item{combined_LD_variants}{A data frame merging selected variants within each LD block in bim file format with columns "chrom", "variants", +#' "GD", "pos", "A1", and "A2".} +#' \item{combined_LD_matrix}{The LD matrix for each region, with row and column names matching variant identifiers.} +#' \item{block_indices}{A data frame tracking the indices of variants in the combined matrix by LD block, +#' with columns "block_id", "start_idx", "end_idx", "chrom", "block_start", "block_end" to facilitate +#' further partitioning if needed.} #' } #' @export -load_LD_matrix <- function(LD_meta_file_path, region, extract_coordinates = NULL, - return_genotype = FALSE, n_sample = NULL) { - source <- resolve_ld_source(LD_meta_file_path) - - if (source$type %in% c("plink2", "plink1")) { - return(load_LD_from_genotype(source$data_path, region, source$type, - return_genotype = return_genotype, - n_sample = n_sample)) - } - - # Pre-computed LD blocks (.cor.xz) - if (return_genotype) { - stop("return_genotype=TRUE requires PLINK genotype files, not pre-computed LD matrices.") - } - load_LD_from_blocks(source$meta_path, region, extract_coordinates, n_sample = n_sample) -} - -# ---------- Internal: resolve LD source type ---------- - -#' @noRd -has_plink2_files <- function(prefix) { - file.exists(paste0(prefix, ".pgen")) && - (file.exists(paste0(prefix, ".pvar")) || file.exists(paste0(prefix, ".pvar.zst"))) && - file.exists(paste0(prefix, ".psam")) -} - -#' @noRd -has_plink1_files <- 
function(prefix) { - file.exists(paste0(prefix, ".bed")) && - file.exists(paste0(prefix, ".bim")) && - file.exists(paste0(prefix, ".fam")) -} - -#' Resolve an LD source path to its actual data type. -#' -#' Handles both direct PLINK prefixes and metadata TSV files (which may -#' themselves point to PLINK files or pre-computed .cor.xz matrices). -#' -#' @param path Direct PLINK prefix or metadata TSV file path. -#' @return A list with: -#' \item{type}{"plink2", "plink1", or "precomputed"} -#' \item{data_path}{Resolved PLINK prefix (for plink types)} -#' \item{meta_path}{Metadata TSV path (for precomputed, or when input was a TSV)} -#' @importFrom vroom vroom -#' @noRd -resolve_ld_source <- function(path) { - # Direct PLINK prefix? - if (has_plink2_files(path)) return(list(type = "plink2", data_path = path)) - if (has_plink1_files(path)) return(list(type = "plink1", data_path = path)) - - # Must be a metadata file - if (!file.exists(path)) { - stop("Cannot determine LD source from path: ", path, - "\n Expected: PLINK2 prefix (.pgen/.pvar[.zst]/.psam), ", - "PLINK1 prefix (.bed/.bim/.fam), or LD metadata file.") - } - - # Peek at first row's path column to determine underlying data type - meta <- as.data.frame(vroom(path, show_col_types = FALSE, n_max = 1)) - colnames(meta)[1] <- "chrom" - raw_path <- gsub(",.*$", "", meta$path[1]) # strip comma-separated bim path - resolved <- file.path(dirname(path), raw_path) - - if (has_plink2_files(resolved)) return(list(type = "plink2", data_path = resolved, meta_path = path)) - if (has_plink1_files(resolved)) return(list(type = "plink1", data_path = resolved, meta_path = path)) - - # Pre-computed .cor.xz blocks - list(type = "precomputed", meta_path = path) -} - -# ---------- Internal: load LD from genotype files ---------- - -#' Load genotype data from PLINK files and compute LD or return genotype matrix. -#' @param source_type Character, "plink1" or "plink2" (from resolve_ld_source). 
-#' @noRd -load_LD_from_genotype <- function(prefix, region, source_type, - return_genotype = FALSE, n_sample = NULL) { - # Load genotype matrix and variant info - result <- if (source_type == "plink2") { - load_plink2_data(prefix, region = region) - } else { - load_plink1_data(prefix, region = region) - } - X <- result$X - variant_info <- result$variant_info - - # Normalize variant IDs to canonical format (chr:pos:A2:A1) - variant_ids <- normalize_variant_id( - format_variant_id(variant_info$chrom, variant_info$pos, variant_info$A2, variant_info$A1) - ) - colnames(X) <- variant_ids - - # Build ref_panel - ref_panel <- parse_variant_id(variant_ids) - ref_panel$variant_id <- variant_ids - - # Load allele frequency from .afreq file (required for PLINK sources) - afreq <- read_afreq(prefix) - if (is.null(afreq)) { - stop("Allele frequency file (.afreq or .afreq.zst) not found at prefix: ", prefix, - "\n The .afreq file is required for PLINK genotype LD sources.") - } - freq_match <- match(variant_info$id, afreq$id) - n_unmatched <- sum(is.na(freq_match)) - if (n_unmatched > 0) { - warning(n_unmatched, " out of ", length(freq_match), - " variants have no allele frequency in .afreq file.") - } - ref_panel$allele_freq <- afreq$alt_freq[freq_match] - - # Compute variance if sample size provided - if (!is.null(n_sample)) { - p <- ref_panel$allele_freq - ref_panel$variance <- 2 * p * (1 - p) * n_sample / (n_sample - 1) - ref_panel$n_nomiss <- n_sample - } - - # Block metadata (single block spanning the loaded region) - positions <- variant_info$pos - block_metadata <- data.frame( - block_id = 1L, - chrom = as.character(variant_info$chrom[1]), - block_start = min(positions), - block_end = max(positions), - size = length(variant_ids), - start_idx = 1L, - end_idx = length(variant_ids), - stringsAsFactors = FALSE - ) - - if (return_genotype) { - # Note: LD_matrix holds the genotype matrix X (not LD) when return_genotype=TRUE - return(list( - LD_variants = variant_ids, - 
LD_matrix = X, - ref_panel = ref_panel, - block_metadata = block_metadata - )) - } - - # Compute LD correlation matrix - R <- compute_LD(X, method = "sample") - - list( - LD_variants = variant_ids, - LD_matrix = R, - ref_panel = ref_panel, - block_metadata = block_metadata - ) -} - -# ---------- Internal: load LD from pre-computed blocks ---------- - -#' Load pre-computed LD from block-based metadata files. -#' @noRd -load_LD_from_blocks <- function(LD_meta_file_path, region, extract_coordinates = NULL, n_sample = NULL) { - # Intersect LD metadata with specified regions +load_LD_matrix <- function(LD_meta_file_path, region, extract_coordinates = NULL) { + # Intersect LD metadata with specified regions using updated function intersected_LD_files <- get_regional_ld_meta(LD_meta_file_path, region) + # Extract file paths for LD and bim files LD_file_paths <- intersected_LD_files$intersections$LD_file_paths bim_file_paths <- intersected_LD_files$intersections$bim_file_paths + # Using a for loop here to allow for rm() in each loop to save memory extracted_LD_matrices_list <- list() extracted_LD_variants_list <- list() block_chroms <- character(length(LD_file_paths)) + # Process each LD block individually for (j in seq_along(LD_file_paths)) { LD_matrix_processed <- process_LD_matrix(LD_file_paths[j], bim_file_paths[j]) extracted_LD_list <- extract_LD_for_region( @@ -422,112 +343,119 @@ load_LD_from_blocks <- function(LD_meta_file_path, region, extract_coordinates = if (nrow(extracted_LD_variants_list[[j]]) > 0) { block_chroms[j] <- as.character(extracted_LD_variants_list[[j]]$chrom[1]) } else { + # Use region chromosome as fallback for empty blocks block_chroms[j] <- as.character(intersected_LD_files$region$chrom) } } - LD_matrix <- create_LD_matrix( + # Filter out empty blocks before combining + non_empty <- sapply(extracted_LD_variants_list, function(v) nrow(v) > 0) + if (!any(non_empty)) { + stop("No variants found in any LD block for the specified region.") + } + if 
(any(!non_empty)) { + message(paste( + "Removing", sum(!non_empty), "empty LD block(s) with no variants in the region." + )) + extracted_LD_matrices_list <- extracted_LD_matrices_list[non_empty] + extracted_LD_variants_list <- extracted_LD_variants_list[non_empty] + block_chroms <- block_chroms[non_empty] + LD_file_paths <- LD_file_paths[non_empty] + } + + # Create combined LD matrix + combined_LD_matrix <- create_combined_LD_matrix( LD_matrices = extracted_LD_matrices_list, variants = extracted_LD_variants_list ) - LD_variants <- rownames(LD_matrix) + # Variants are already in canonical format (chr prefix) from process_LD_matrix / normalize_variant_id + combined_LD_variants <- rownames(combined_LD_matrix) + # Build block metadata — variants already in canonical format, no normalization needed block_variants <- lapply(extracted_LD_variants_list, function(v) v$variants) - block_positions <- lapply(extracted_LD_variants_list, function(v) v$pos) block_metadata <- data.frame( block_id = seq_along(LD_file_paths), chrom = block_chroms, - block_start = sapply(block_positions, min), - block_end = sapply(block_positions, max), size = sapply(block_variants, length), - start_idx = sapply(block_variants, function(v) min(match(v, LD_variants))), - end_idx = sapply(block_variants, function(v) max(match(v, LD_variants))), + start_idx = sapply(block_variants, function(v) min(match(v, combined_LD_variants))), + end_idx = sapply(block_variants, function(v) max(match(v, combined_LD_variants))), stringsAsFactors = FALSE ) + # Remove large objects to free memory rm(extracted_LD_matrices_list) - ref_panel <- parse_variant_id(rownames(LD_matrix)) + # Create reference panel from canonical variant IDs + ref_panel <- parse_variant_id(rownames(combined_LD_matrix)) merged_variant_list <- do.call(rbind, extracted_LD_variants_list) - ref_panel$variant_id <- rownames(LD_matrix) + ref_panel$variant_id <- rownames(combined_LD_matrix) - if ("allele_freq" %in% colnames(merged_variant_list)) { - 
ref_panel$allele_freq <- merged_variant_list$allele_freq[match(rownames(LD_matrix), merged_variant_list$variants)] - } if ("variance" %in% colnames(merged_variant_list)) { - ref_panel$variance <- merged_variant_list$variance[match(rownames(LD_matrix), merged_variant_list$variants)] - } - if ("n_nomiss" %in% colnames(merged_variant_list)) { - ref_panel$n_nomiss <- merged_variant_list$n_nomiss[match(rownames(LD_matrix), merged_variant_list$variants)] + ref_panel$variance <- merged_variant_list$variance[match(rownames(combined_LD_matrix), merged_variant_list$variants)] } - # Compute variance from n_sample + allele_freq if not already present - if (!is.null(n_sample) && (!"variance" %in% colnames(ref_panel) || all(is.na(ref_panel$variance)))) { - if ("allele_freq" %in% colnames(ref_panel)) { - p <- ref_panel$allele_freq - ref_panel$variance <- 2 * p * (1 - p) * n_sample / (n_sample - 1) - ref_panel$n_nomiss <- n_sample - } - } - - list( - LD_variants = LD_variants, - LD_matrix = LD_matrix, + # Return the combined LD list + combined_LD_list <- list( + combined_LD_variants = combined_LD_variants, + combined_LD_matrix = combined_LD_matrix, ref_panel = ref_panel, block_metadata = block_metadata ) + + return(combined_LD_list) } #' Filter variants by LD Reference #' -#' Filters a vector of variant IDs to those present in the LD reference panel. -#' Auto-detects the reference type (PLINK2, PLINK1, or pre-computed LD metadata). -#' -#' @param variant_ids variant names in the format chr:pos:ref:alt. -#' @param ld_reference_meta_file Path to LD metadata file or PLINK prefix. -#' @param keep_indel Whether to keep indel variants. Default TRUE. -#' @return A list with: -#' \item{data}{Character vector of filtered variant IDs.} -#' \item{idx}{Integer vector of indices into the original variant_ids.} -#' @importFrom dplyr group_by summarise +#' @param variant_ids variant names in the format chr:pos_ref_alt or chr:pos:ref:alt. 
+#' @param ld_reference_meta_file A data frame similar to 'genomic_data' in get_regional_ld_meta function. +#' @return A subset of variants, filtered based on LD reference data. +#' @importFrom stringr str_split +#' @importFrom dplyr select group_by summarise #' @importFrom vroom vroom -#' @importFrom magrittr %>% #' @export filter_variants_by_ld_reference <- function(variant_ids, ld_reference_meta_file, keep_indel = TRUE) { - variants_df <- parse_variant_id(variant_ids) - - # Derive region to scope the reference lookup + # Step 1: Process variant IDs into a data frame and filter out non-standard nucleotides + variants_df <- do.call(rbind, lapply(strsplit(variant_ids, ":"), function(x) { + data.frame(chrom = x[1], pos = as.integer(x[2]), ref = x[3], alt = x[4]) + })) + + variants_df$chrom <- ifelse(grepl("^chr", variants_df$chrom), + as.integer(sub("^chr", "", variants_df$chrom)), # Remove 'chr' and convert to integer + as.integer(variants_df$chrom) + ) + # Step 2: Derive region information from the variants data frame region_df <- variants_df %>% group_by(chrom) %>% summarise(start = min(pos), end = max(pos)) - - # Use shared helper -- no genotype loading - ref_info <- get_ref_variant_info(ld_reference_meta_file, region_df) - ref_chrom <- as.integer(strip_chr_prefix(ref_info$chrom)) - ref_key <- paste0(ref_chrom, ":", ref_info$pos) - - variant_key <- paste0(variants_df$chrom, ":", variants_df$pos) - keep_indices <- which(variant_key %in% ref_key) - + # Step 3: Call get_regional_ld_meta to get bim_file_paths + bim_file_paths <- get_regional_ld_meta(ld_reference_meta_file, region_df)$intersections$bim_file_paths + + # Step 4: Load bim files and consolidate into a single data frame + bim_data <- lapply(bim_file_paths, function(path) { + bim_df <- vroom(path, col_names = FALSE) + data.frame(chrom = bim_df$X1, pos = bim_df$X4, stringsAsFactors = FALSE) + }) %>% + do.call("rbind", .) 
+ + # Step 5: Overlap the variants data frame with bim_data + keep_indices <- which(paste(variants_df$chrom, variants_df$pos) %in% paste(bim_data$chrom, bim_data$pos)) if (!keep_indel) { - snp_idx <- which(is_snp_alleles(variants_df$A1, variants_df$A2)) + valid_nucleotides <- c("A", "T", "C", "G") + snp_idx <- which((variants_df$ref %in% valid_nucleotides) & (variants_df$alt %in% valid_nucleotides)) keep_indices <- intersect(keep_indices, snp_idx) } + variants_filtered <- variant_ids[keep_indices] - message(length(variant_ids) - length(keep_indices), " out of ", length(variant_ids), - " total variants dropped due to absence on the reference LD panel.") + message(length(variant_ids) - length(keep_indices), " out of ", length(variant_ids), " total variants dropped due to absence on the reference LD panel.") - list(data = variant_ids[keep_indices], idx = keep_indices) + return(list(data = variants_filtered, idx = keep_indices)) } #' Partition LD Matrix into Block-Specific Matrices #' -#' This function takes the output from load_LD_matrix and partitions the combined LD matrix -#' into a list of smaller matrices based on the block_indices, making it easier to work with -#' large LD matrices that span multiple blocks. -#' -#' @param ld_data A list as returned by load_LD_matrix, containing LD_matrix, -#' LD_variants, ref_panel, and block_metadata. +#' @param ld_data A list as returned by load_LD_matrix, containing combined_LD_matrix, +#' combined_LD_variants, ref_panel, and block_metadata. #' @param merge_small_blocks Logical, whether to merge blocks smaller than min_merged_block_size (default: TRUE). #' @param min_merged_block_size Integer, minimum number of variants for a block after merging (default: 500). #' @param max_merged_block_size Integer, maximum number of variants in a block after merging (default: 10000). 
@@ -542,9 +470,9 @@ filter_variants_by_ld_reference <- function(variant_ids, ld_reference_meta_file, partition_LD_matrix <- function(ld_data, merge_small_blocks = TRUE, min_merged_block_size = 500, max_merged_block_size = 10000) { # Extract components from ld_data - combined_matrix <- ld_data$LD_matrix + combined_matrix <- ld_data$combined_LD_matrix block_metadata <- ld_data$block_metadata - variant_ids <- ld_data$LD_variants + variant_ids <- ld_data$combined_LD_variants # Error if matrix is empty if (is.null(combined_matrix) || nrow(combined_matrix) == 0 || ncol(combined_matrix) == 0) { @@ -558,6 +486,30 @@ partition_LD_matrix <- function(ld_data, merge_small_blocks = TRUE, colnames(combined_matrix) <- variant_ids } + # Filter out blocks with invalid indices (empty blocks, out-of-range, NA, Inf) + n_variants <- length(variant_ids) + valid_blocks <- sapply(seq_len(nrow(block_metadata)), function(i) { + s <- block_metadata$start_idx[i] + e <- block_metadata$end_idx[i] + sz <- block_metadata$size[i] + # Block is valid if: size > 0, indices are finite integers, and within range + !is.na(s) && !is.na(e) && is.finite(s) && is.finite(e) && + sz > 0 && s >= 1 && e >= s && e <= n_variants + }) + + if (!any(valid_blocks)) { + stop("No valid LD blocks found. All block indices are out of range or empty.") + } + + if (any(!valid_blocks)) { + message(paste( + "Removing", sum(!valid_blocks), + "LD block(s) with invalid or out-of-range indices." + )) + block_metadata <- block_metadata[valid_blocks, , drop = FALSE] + block_metadata$block_id <- seq_len(nrow(block_metadata)) + } + # Validate the block structure of the matrix (skip if only one block) if (nrow(block_metadata) > 1) { validate_block_structure(combined_matrix, block_metadata, variant_ids) @@ -573,79 +525,176 @@ partition_LD_matrix <- function(ld_data, merge_small_blocks = TRUE, return(result) } -#' Validate that cross-block entries are zero (excluding boundary variants). 
# Helper function to validate that an LD matrix has block-diagonal structure:
# correlations between variants of distinct blocks must be (numerically)
# zero. Boundary variants -- the last variant of the earlier block and the
# first variant of the later block, which may legitimately be shared by
# adjacent blocks -- are excluded from the check. Stops with an informative
# error listing every violation; returns NULL invisibly on success.
validate_block_structure <- function(matrix, block_metadata, variant_ids) {
  n_blocks <- nrow(block_metadata)
  n_variants <- length(variant_ids)

  # Fewer than two blocks: nothing to cross-check. This guard also avoids
  # the backwards sequence produced by 1:(n_blocks - 1) when n_blocks == 1.
  if (n_blocks < 2) {
    return(invisible(NULL))
  }

  validation_message <- character(0)

  # Iterate over all pairs of distinct blocks
  for (i in 1:(n_blocks - 1)) {
    for (j in (i + 1):n_blocks) {
      block_i_start <- block_metadata$start_idx[i]
      block_i_end <- block_metadata$end_idx[i]
      block_j_start <- block_metadata$start_idx[j]
      block_j_end <- block_metadata$end_idx[j]

      # Record (and skip) blocks whose indices fall outside variant_ids.
      if (block_i_start > n_variants || block_i_end > n_variants ||
          block_j_start > n_variants || block_j_end > n_variants) {
        validation_message <- c(
          validation_message,
          paste("Block indices are out of range for blocks", i, "and", j)
        )
        next
      }

      # Drop the boundary variants. Build the index vectors with seq_len
      # rather than the colon operator: for a size-1 block,
      # start:(end - 1) counts DOWN (e.g. 5:4) and selects wrong variants,
      # and (start + 1):end can index one past variant_ids, producing an
      # NA subscript error. seq_len yields an empty vector instead.
      block_i_idx <- block_i_start + seq_len(block_i_end - block_i_start) - 1
      block_j_idx <- block_j_start + seq_len(block_j_end - block_j_start)
      block_i_vars <- variant_ids[block_i_idx]
      block_j_vars <- variant_ids[block_j_idx]

      # Only check when both blocks contribute non-boundary variants.
      if (length(block_i_vars) > 0 && length(block_j_vars) > 0) {
        cross_block_unique <- matrix[block_i_vars, block_j_vars, drop = FALSE]
        max_value <- max(abs(cross_block_unique))
        if (max_value > 1e-10) {
          validation_message <- c(
            validation_message,
            paste(
              "Non-zero correlation detected between unique variants of blocks", i, "and", j,
              "- Max value:", max_value
            )
          )
        }
      }
    }
  }

  # Report every collected violation in a single error.
  if (length(validation_message) > 0) {
    stop(
      "Matrix does not have the expected block structure:\n",
      paste(validation_message, collapse = "\n")
    )
  }
}

# Check whether two blocks may be merged: they must share a chromosome and
# their combined variant count must not exceed max_size.
can_merge <- function(block1, block2, max_size) {
  same_chrom <- block1$chrom == block2$chrom
  combined_size <- block1$size + block2$size
  same_chrom && combined_size <= max_size
}

# Merge block idx2 into block idx1 (indices normalized so idx1 < idx2):
# extend the first block's end index, sum the sizes, drop the second
# block's row, and renumber block_id sequentially.
merge_two_blocks <- function(block_metadata, idx1, idx2) {
  if (idx1 > idx2) {
    temp <- idx1
    idx1 <- idx2
    idx2 <- temp
  }

  result <- block_metadata
  result$end_idx[idx1] <- block_metadata$end_idx[idx2]
  result$size[idx1] <- block_metadata$size[idx1] + block_metadata$size[idx2]
  result <- result[-idx2, ]
  result$block_id <- seq_len(nrow(result))
  result
}
# Identify undersized blocks and, for each, the neighbouring block it should
# be merged into. A block qualifies when its size is below min_size; the
# partner is an adjacent block on the same chromosome whose combined size
# stays within max_size, preferring the smaller of the two neighbours.
# Returns a data frame with columns block_idx and merge_with (zero rows when
# no merge is possible).
find_merge_candidates <- function(block_metadata, min_size, max_size) {
  n_blocks <- nrow(block_metadata)

  # Returns the row index to merge block i with, or NA when neither
  # neighbour is eligible.
  pick_partner <- function(i) {
    prev_ok <- i > 1 &&
      can_merge(block_metadata[i, ], block_metadata[i - 1, ], max_size)
    next_ok <- i < n_blocks &&
      can_merge(block_metadata[i, ], block_metadata[i + 1, ], max_size)

    if (prev_ok && next_ok) {
      # Both neighbours qualify: take the smaller one to minimise impact.
      if (block_metadata$size[i - 1] <= block_metadata$size[i + 1]) i - 1 else i + 1
    } else if (prev_ok) {
      i - 1
    } else if (next_ok) {
      i + 1
    } else {
      NA_integer_
    }
  }

  undersized <- which(block_metadata$size < min_size)
  partners <- vapply(undersized, pick_partner, numeric(1))
  eligible <- !is.na(partners)

  data.frame(
    block_idx = undersized[eligible],
    merge_with = partners[eligible],
    stringsAsFactors = FALSE
  )
}

# Repeatedly merge undersized blocks into eligible neighbours until every
# remaining block meets min_size or no further merge is possible. Exactly
# one merge is applied per iteration so row indices stay valid between
# candidate searches.
merge_blocks <- function(block_metadata, min_size, max_size) {
  # A single block (or empty metadata) has nothing to merge with.
  if (nrow(block_metadata) <= 1) {
    return(block_metadata)
  }

  current <- block_metadata
  repeat {
    candidates <- find_merge_candidates(current, min_size, max_size)
    if (nrow(candidates) == 0) {
      break
    }
    current <- merge_two_blocks(current, candidates$block_idx[1],
                                candidates$merge_with[1])
  }
  current
}
a/man/filter_variants_by_ld_reference.Rd +++ b/man/filter_variants_by_ld_reference.Rd @@ -11,18 +11,13 @@ filter_variants_by_ld_reference( ) } \arguments{ -\item{variant_ids}{variant names in the format chr:pos:ref:alt.} +\item{variant_ids}{variant names in the format chr:pos_ref_alt or chr:pos:ref:alt.} -\item{ld_reference_meta_file}{Path to LD metadata file or PLINK prefix.} - -\item{keep_indel}{Whether to keep indel variants. Default TRUE.} +\item{ld_reference_meta_file}{A data frame similar to 'genomic_data' in get_regional_ld_meta function.} } \value{ -A list with: - \item{data}{Character vector of filtered variant IDs.} - \item{idx}{Integer vector of indices into the original variant_ids.} +A subset of variants, filtered based on LD reference data. } \description{ -Filters a vector of variant IDs to those present in the LD reference panel. -Auto-detects the reference type (PLINK2, PLINK1, or pre-computed LD metadata). +Filter variants by LD Reference } diff --git a/man/load_LD_matrix.Rd b/man/load_LD_matrix.Rd index cc3903f2..ec7d5d82 100644 --- a/man/load_LD_matrix.Rd +++ b/man/load_LD_matrix.Rd @@ -4,45 +4,29 @@ \alias{load_LD_matrix} \title{Load and Process Linkage Disequilibrium (LD) Matrix} \usage{ -load_LD_matrix( - LD_meta_file_path, - region, - extract_coordinates = NULL, - return_genotype = FALSE, - n_sample = NULL -) +load_LD_matrix(LD_meta_file_path, region, extract_coordinates = NULL) } \arguments{ -\item{LD_meta_file_path}{Path to LD metadata file, or prefix for PLINK genotype files.} +\item{LD_meta_file_path}{path of LD_metadata, LD_metadata is a data frame specifying LD blocks with +columns "chrom", "start", "end", and "path". "start" and "end" denote the positions of LD blocks. 
+"path" is the path of each LD block, optionally including bim file paths.} -\item{region}{Region of interest: "chr:start-end" string or data.frame with chrom/start/end.} +\item{region}{A data frame specifying region of interest with columns "chrom", "start", and "end".} -\item{extract_coordinates}{Optional data.frame with columns "chrom" and "pos" for -specific coordinates extraction (only for pre-computed LD blocks).} - -\item{return_genotype}{If TRUE, return the genotype matrix X instead of the LD -correlation matrix R. Only valid for PLINK genotype sources.} - -\item{n_sample}{Optional sample size for computing variance (= 2*p*(1-p)*n/(n-1)). -If NULL, ref_panel will not include variance or n_nomiss columns. -Only used for PLINK genotype sources.} +\item{extract_coordinates}{Optional data frame with columns "chrom" and "pos" for specific coordinates extraction.} } \value{ -A list with: +A list of processed LD matrices and associated variant data frames for region of interest. +Each element of the list contains: \describe{ - \item{LD_variants}{Character vector of variant IDs (canonical format).} - \item{LD_matrix}{Symmetric LD correlation matrix (or genotype matrix if return_genotype=TRUE).} - \item{ref_panel}{Data.frame with variant metadata (chrom, pos, A2, A1, variant_id, - and optionally allele_freq, variance, n_nomiss).} - \item{block_metadata}{Data.frame with block info (block_id, chrom, block_start, block_end, size, start_idx, end_idx).} +\item{combined_LD_variants}{A data frame merging selected variants within each LD block in bim file format with columns "chrom", "variants", +"GD", "pos", "A1", and "A2".} +\item{combined_LD_matrix}{The LD matrix for each region, with row and column names matching variant identifiers.} +\item{block_indices}{A data frame tracking the indices of variants in the combined matrix by LD block, +with columns "block_id", "start_idx", "end_idx", "chrom", "block_start", "block_end" to facilitate +further partitioning if needed.} } 
} \description{ -Unified entry point for loading LD data. Auto-detects the source type: -\itemize{ - \item Pre-computed LD blocks (metadata file with chrom/start/end/path columns) - \item PLINK2 genotype files (.pgen/.pvar[.zst]/.psam + .afreq) - \item PLINK1 genotype files (.bed/.bim/.fam) -} -For PLINK genotype sources, LD is computed on the fly via \code{compute_LD()}. +Load and Process Linkage Disequilibrium (LD) Matrix }