From f4f41eaf29823ea81acb9f842940bfb57a805f93 Mon Sep 17 00:00:00 2001
From: xuewei cao <36172337+xueweic@users.noreply.github.com>
Date: Sat, 28 Mar 2026 08:49:18 -0400
Subject: [PATCH 1/3] Update LD.R

empty LD block issue
max(integer) return Inf
---
 R/LD.R | 43 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 39 insertions(+), 4 deletions(-)

diff --git a/R/LD.R b/R/LD.R
index 610dd7af..014c7cda 100644
--- a/R/LD.R
+++ b/R/LD.R
@@ -348,6 +348,21 @@ load_LD_matrix <- function(LD_meta_file_path, region, extract_coordinates = NULL
     }
   }
 
+  # Filter out empty blocks before combining
+  non_empty <- sapply(extracted_LD_variants_list, function(v) nrow(v) > 0)
+  if (!any(non_empty)) {
+    stop("No variants found in any LD block for the specified region.")
+  }
+  if (any(!non_empty)) {
+    message(paste(
+      "Removing", sum(!non_empty), "empty LD block(s) with no variants in the region."
+    ))
+    extracted_LD_matrices_list <- extracted_LD_matrices_list[non_empty]
+    extracted_LD_variants_list <- extracted_LD_variants_list[non_empty]
+    block_chroms <- block_chroms[non_empty]
+    LD_file_paths <- LD_file_paths[non_empty]
+  }
+
   # Create combined LD matrix
   combined_LD_matrix <- create_combined_LD_matrix(
     LD_matrices = extracted_LD_matrices_list,
@@ -439,10 +454,6 @@ filter_variants_by_ld_reference <- function(variant_ids, ld_reference_meta_file,
 
 #' Partition LD Matrix into Block-Specific Matrices
 #'
-#' This function takes the output from load_LD_matrix and partitions the combined LD matrix
-#' into a list of smaller matrices based on the block_indices, making it easier to work with
-#' large LD matrices that span multiple blocks.
-#'
 #' @param ld_data A list as returned by load_LD_matrix, containing combined_LD_matrix,
 #'                combined_LD_variants, ref_panel, and block_metadata.
 #' @param merge_small_blocks Logical, whether to merge blocks smaller than min_merged_block_size (default: TRUE).
@@ -475,6 +486,30 @@ partition_LD_matrix <- function(ld_data, merge_small_blocks = TRUE,
     colnames(combined_matrix) <- variant_ids
   }
 
+  # Filter out blocks with invalid indices (empty blocks, out-of-range, NA, Inf)
+  n_variants <- length(variant_ids)
+  valid_blocks <- sapply(seq_len(nrow(block_metadata)), function(i) {
+    s <- block_metadata$start_idx[i]
+    e <- block_metadata$end_idx[i]
+    sz <- block_metadata$size[i]
+    # Block is valid if: size > 0, indices are finite integers, and within range
+    !is.na(s) && !is.na(e) && is.finite(s) && is.finite(e) &&
+      sz > 0 && s >= 1 && e >= s && e <= n_variants
+  })
+
+  if (!any(valid_blocks)) {
+    stop("No valid LD blocks found. All block indices are out of range or empty.")
+  }
+
+  if (any(!valid_blocks)) {
+    message(paste(
+      "Removing", sum(!valid_blocks),
+      "LD block(s) with invalid or out-of-range indices."
+    ))
+    block_metadata <- block_metadata[valid_blocks, , drop = FALSE]
+    block_metadata$block_id <- seq_len(nrow(block_metadata))
+  }
+
   # Validate the block structure of the matrix (skip if only one block)
   if (nrow(block_metadata) > 1) {
     validate_block_structure(combined_matrix, block_metadata, variant_ids)

From e9fee129cc573507a0c4f472f2ea90c479271ace Mon Sep 17 00:00:00 2001
From: xuewei cao <36172337+xueweic@users.noreply.github.com>
Date: Sat, 28 Mar 2026 08:51:41 -0400
Subject: [PATCH 2/3] Create LD.R

fix error for missing variant in LD blocks
---
 R/LD.R | 746 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 746 insertions(+)
 create mode 100644 R/LD.R

diff --git a/R/LD.R b/R/LD.R
new file mode 100644
index 00000000..014c7cda
--- /dev/null
+++ b/R/LD.R
@@ -0,0 +1,746 @@
+#' Function to Check if Regions are in increasing order and remove duplicated rows
+#' @importFrom dplyr distinct arrange group_by mutate ungroup
+#' @importFrom magrittr %>%
+#' @noRd
+order_dedup_regions <- function(df) {
+  # Ensure that 'chrom' values are integers, df can be genomic_data or regions_of_interest
+  df$chrom <- ifelse(grepl("^chr", df$chrom),
+    as.integer(sub("^chr", "", df$chrom)), # Remove 'chr' and convert to integer
+    as.integer(df$chrom)
+  ) # Convert to integer if not already
+
+  # Remove duplicated rows based on 'chrom' and 'start' columns
+  df <- distinct(df, chrom, start, .keep_all = TRUE) %>%
+    arrange(chrom, start)
+
+  # for (chr in unique(df$chrom)) {
+  #  chr_rows <- which(df$chrom == chr)
+  #  if (length(chr_rows) > 1) {
+  #    starts <- df$start[chr_rows]
+  #    for (i in 2:length(starts)) {
+  #      if (starts[i] < starts[i - 1]) {
+  #        stop("The input list of regions is not in increasing order within each chromosome.")
+  #      }
+  #    }
+  #  }
+  # }
+
+  return(df)
+}
+
+#' Function to Find Start and End Rows of Genomic Data for Region of Interest
+#' @importFrom dplyr filter arrange slice
+#' @noRd
+find_intersection_rows <- function(genomic_data, region_chrom, region_start, region_end) {
+  # Filter for the specific chromosome
+  chrom_data <- genomic_data %>% filter(chrom == region_chrom)
+
+  min_start <- if (nrow(chrom_data) > 0) min(chrom_data$start) else NA
+  max_end <- if (nrow(chrom_data) > 0) max(chrom_data$end) else NA
+
+  # Adjust region bounds if they're outside available data range
+  if (!is.na(min_start) && region_start < min_start) {
+    region_start <- min_start
+  }
+
+  if (!is.na(max_end) && region_end > max_end) {
+    region_end <- max_end
+  }
+
+  # Try to find rows that cover the region start and end
+  start_row <- genomic_data %>%
+    filter(chrom == region_chrom, start <= region_start, end > region_start) %>%
+    slice(1)
+
+  end_row <- genomic_data %>%
+    filter(chrom == region_chrom, start < region_end, end >= region_end) %>%
+    arrange(desc(end)) %>%
+    slice(1)
+
+  if (nrow(start_row) == 0 || nrow(end_row) == 0) {
+    stop("Region of interest is not covered by any rows in the data frame.")
+  }
+
+  list(start_row = start_row, end_row = end_row)
+}
+
+#' Function to Validate Selected Region
+#' @noRd
+validate_selected_region <- function(start_row, end_row, region_start, region_end) {
+  if (!(start_row$start <= region_start && end_row$end >= region_end)) {
+    stop("The selected region is not fully covered by the merged region.")
+  }
+}
+
+#' Extract File Paths Based on Intersection Criteria
+#' @noRd
+extract_file_paths <- function(genomic_data, intersection_rows, column_to_extract) {
+  # Ensure the file_path_column exists in genomic_data
+  if (!column_to_extract %in% names(genomic_data)) {
+    stop(paste("Column", column_to_extract, "not found in genomic data"))
+  }
+  # Extract rows based on intersection criteria
+  extracted_paths <- genomic_data[[column_to_extract]][which(
+    genomic_data$chrom == intersection_rows$start_row$chrom &
+      genomic_data$start >= intersection_rows$start_row$start &
+      genomic_data$start <= intersection_rows$end_row$start
+  )]
+
+  # Return the extracted paths
+  return(extracted_paths)
+}
+
+#' Intersect LD reference with Regions of Interest
+#'
+#' @param ld_reference_meta_file A file of data frame with columns "chrom", "start", "end", and "path" representing genomic regions.
+#' "chrom" is the chromosome, "start" and "end" are the positions of the LD block, and "path" is the file path for the LD block.
+#' @param region A data frame with columns "chrom", "start", and "end" specifying regions of interest.
+#' "start" and "end" are the positions of these regions. Or it can take the form of `chr:start-end`
+#'
+#' @return A list containing processed genomic data, region of interest, and a detailed result list.
+#' The result list contains, for each region:
+#' - The start and end row indices in the genomic data
+#' - File paths from the genomic data corresponding to intersected regions
+#' - Optionally, bim file paths if available
+#' @importFrom stringr str_split
+#' @importFrom dplyr select
+#' @importFrom vroom vroom
+#' @noRd
+get_regional_ld_meta <- function(ld_reference_meta_file, region, complete_coverage_required = FALSE) {
+  genomic_data <- vroom(ld_reference_meta_file)
+  region <- parse_region(region)
+  # Set column names
+  names(genomic_data) <- c("chrom", "start", "end", "path")
+  names(region) <- c("chrom", "start", "end")
+
+  # Order and deduplicate regions
+  genomic_data <- order_dedup_regions(genomic_data)
+  region <- order_dedup_regions(region)
+
+  # Process file paths
+  file_path <- genomic_data$path %>%
+    str_split(",", simplify = TRUE) %>%
+    data.frame() %>%
+    `colnames<-`(if (ncol(.) == 2) c("LD_file_path", "bim_file_path") else c("LD_file_path"))
+
+  genomic_data <- cbind(genomic_data, file_path) %>% select(-path)
+
+  # Find intersection rows
+  intersection_rows <- find_intersection_rows(genomic_data, region$chrom, region$start, region$end)
+
+  # Validate region
+  if (complete_coverage_required) {
+    validate_selected_region(intersection_rows$start_row, intersection_rows$end_row, region$start, region$end)
+  }
+
+  # Extract file paths
+  LD_paths <- find_valid_file_paths(ld_reference_meta_file, extract_file_paths(genomic_data, intersection_rows, "LD_file_path"))
+  bim_paths <- if ("bim_file_path" %in% names(genomic_data)) {
+    find_valid_file_paths(ld_reference_meta_file, extract_file_paths(genomic_data, intersection_rows, "bim_file_path"))
+  } else {
+    NULL
+  }
+
+  return(list(
+    intersections = list(
+      start_index = intersection_rows$start_row,
+      end_index = intersection_rows$end_row,
+      LD_file_paths = LD_paths,
+      bim_file_paths = bim_paths
+    ),
+    ld_meta_data = genomic_data,
+    region = region
+  ))
+}
+
+#' @importFrom dplyr mutate
+#' @importFrom utils read.table
+#' @importFrom stats setNames
+# Process an LD matrix from a file path
+process_LD_matrix <- function(LD_file_path, bim_file_path) {
+  # Order the variants by position
+  order_variants_by_position <- function(strings) {
+    # Apply the function to each variant to get a vector of positions
+    positions <- sapply(strings, function(variant) as.integer(strsplit(variant, ":")[[1]][2]))
+    # Check whether the merged variants is orderd
+    # when diff() returns 0, it is multiallelic at the position (same position but >1 variations)
+    if (!all(diff(positions[order(positions)]) >= 0)) {
+      stop("The positions are not in non-decreasing order")
+    }
+    # Order the variants by position
+    strings_ordered <- strings[order(positions)]
+    return(strings_ordered)
+  }
+  # Read the LD matrix
+  LD_file_con <- xzfile(LD_file_path)
+  LD_matrix <- scan(LD_file_con, quiet = TRUE)
+  close(LD_file_con)
+  LD_matrix <- matrix(LD_matrix, ncol = sqrt(length(LD_matrix)), byrow = TRUE)
+
+  # Process the bim file to extract variant information
+  bim_file_name <- if (!is.null(bim_file_path)) {
+    bim_file_path
+  } else {
+    paste0(LD_file_path, ".bim", sep = "")
+  }
+
+  # Process variant names from file paths
+  LD_variants <- read.table(bim_file_name)
+  if (ncol(LD_variants) == 9) {
+    LD_variants <- LD_variants %>%
+      setNames(c("chrom", "variants", "GD", "pos", "A1", "A2", "variance", "allele_freq", "n_nomiss")) %>%
+      mutate(chrom = as.character(as.integer(sub("^chr", "", chrom)))) %>%
+      mutate(variants = normalize_variant_id(variants))
+  } else if (ncol(LD_variants) == 6) {
+    LD_variants <- LD_variants %>%
+      setNames(c("chrom", "variants", "GD", "pos", "A1", "A2")) %>%
+      mutate(chrom = as.character(as.integer(sub("^chr", "", chrom)))) %>%
+      mutate(variants = normalize_variant_id(variants))
+  } else {
+    stop("Unexpected number of columns in the input file.")
+  }
+
+
+  # Set column and row names of the LD matrix
+  colnames(LD_matrix) <- rownames(LD_matrix) <- LD_variants$variants
+  # Check if the matrix is upper diagonal
+  # We assume a matrix is upper diagonal if all elements below the main diagonal are zero
+  is_upper_diagonal <- all(LD_matrix[lower.tri(LD_matrix)] == 0)
+  if (is_upper_diagonal) {
+    # If the matrix is upper diagonal, transpose the upper triangle to the lower triangle
+    LD_matrix[lower.tri(LD_matrix)] <- t(LD_matrix)[lower.tri(LD_matrix)]
+  } else {
+    # If the matrix is lower diagonal, transpose the lower triangle to the upper triangle
+    LD_matrix[upper.tri(LD_matrix)] <- t(LD_matrix)[upper.tri(LD_matrix)]
+  }
+  LD_variants_ordered <- LD_variants[match(order_variants_by_position(LD_variants$variants), LD_variants$variants), ]
+  LD_matrix <- LD_matrix[match(LD_variants_ordered$variants, rownames(LD_matrix)), match(LD_variants_ordered$variants, rownames(LD_matrix))]
+  list(LD_matrix = LD_matrix, LD_variants = LD_variants_ordered)
+}
+
+#' Extract LD matrix and variants for a specific region
+#' @importFrom dplyr mutate select
+#' @importFrom magrittr %>%
+#' @importFrom utils tail
+extract_LD_for_region <- function(LD_matrix, variants, region, extract_coordinates) {
+  # Filter variants based on region
+  extracted_LD_variants <- subset(variants, chrom == region$chrom & pos >= region$start & pos <= region$end)
+  if (!is.null(extract_coordinates)) {
+    # Preprocess 'extract_coordinate' to ensure 'chrom' is numeric and without 'chr'
+    extract_coordinates <- extract_coordinates %>%
+      mutate(chrom = ifelse(grepl("^chr", chrom), as.integer(sub("^chr", "", chrom)), chrom)) %>%
+      select(chrom, pos)
+    # Now merge with 'LD_variants_region_selected'
+    extracted_LD_variants <- extracted_LD_variants %>%
+      # Ensure that 'chrom' values in 'LD_variants_region_selected' are numeric, remove 'chr' if present
+      mutate(chrom = ifelse(grepl("^chr", chrom), as.integer(sub("^chr", "", chrom)), as.integer(chrom))) %>%
+      # Merge with 'extract_coordinate' after 'chrom' adjustment
+      merge(extract_coordinates, by = c("chrom", "pos"))
+    # Select the desired columns, assuming 'variants' column is equivalent to the 'variants' in 'LD_variants_region_selected'
+    # Select columns dynamically based on the presence of 'variance'
+    cols_to_select <- c("chrom", "variants", "pos", "GD", "A1", "A2") # select(chrom, variants, pos, GD, A1, A2)
+    if ("variance" %in% names(extracted_LD_variants)) {
+      cols_to_select <- c(cols_to_select, "variance")
+    }
+    extracted_LD_variants <- select(extracted_LD_variants, all_of(cols_to_select))
+  }
+  # Extract LD matrix
+  extracted_LD_matrix <- LD_matrix[extracted_LD_variants$variants, extracted_LD_variants$variants, drop = FALSE]
+  list(extracted_LD_matrix = extracted_LD_matrix, extracted_LD_variants = extracted_LD_variants)
+}
+
+# Create a combined LD matrix from multiple matrices
+create_combined_LD_matrix <- function(LD_matrices, variants) {
+  # Updated mergeVariants helper that checks the structure of each element.
+  mergeVariants <- function(LD_variants_list) {
+    mergedVariants <- character(0)
+    # Loop over the list of variant information
+    for (LD_variants in LD_variants_list) {
+      currentVariants <- if (is.list(LD_variants) && !is.null(LD_variants$variants)) {
+        LD_variants$variants
+      } else {
+        LD_variants
+      }
+
+      if (length(currentVariants) == 0) next
+
+      # If the last variant in mergedVariants is the same as the first of currentVariants, skip the duplicate
+      if (length(mergedVariants) > 0 && tail(mergedVariants, 1) == currentVariants[1]) {
+        mergedVariants <- c(mergedVariants, currentVariants[-1])
+      } else {
+        mergedVariants <- c(mergedVariants, currentVariants)
+      }
+    }
+    return(mergedVariants)
+  }
+
+  unique_variants <- mergeVariants(variants)
+  # Initialize an empty combined LD matrix with the unique variants
+  combined_LD_matrix <- matrix(0, nrow = length(unique_variants), ncol = length(unique_variants))
+  rownames(combined_LD_matrix) <- unique_variants
+  colnames(combined_LD_matrix) <- unique_variants
+
+  # Function to align the values from each LD matrix to the combined matrix
+  align_matrix <- function(ld_matrix, combined_matrix, variant_names) {
+    indices <- match(variant_names, rownames(combined_matrix))
+    combined_matrix[indices, indices] <- ld_matrix
+    return(combined_matrix)
+  }
+
+  # Use Map to pair each LD matrix with its rownames, then Reduce to fill the combined matrix.
+  combined_LD_matrix <- Reduce(
+    function(x, y) align_matrix(y[[1]], x, y[[2]]),
+    Map(list, LD_matrices, lapply(LD_matrices, rownames)),
+    combined_LD_matrix
+  )
+
+  return(combined_LD_matrix)
+}
+
+#' Load and Process Linkage Disequilibrium (LD) Matrix
+#'
+#' @param LD_meta_file_path path of LD_metadata, LD_metadata is a data frame specifying LD blocks with
+#' columns "chrom", "start", "end", and "path". "start" and "end" denote the positions of LD blocks.
+#' "path" is the path of each LD block, optionally including bim file paths.
+#' @param region A data frame specifying region of interest with columns "chrom", "start", and "end".
+#' @param extract_coordinates Optional data frame with columns "chrom" and "pos" for specific coordinates extraction.
+#'
+#' @return A list of processed LD matrices and associated variant data frames for region of interest.
+#' Each element of the list contains:
+#' \describe{
+#' \item{combined_LD_variants}{A data frame merging selected variants within each LD block in bim file format with columns "chrom", "variants",
+#' "GD", "pos", "A1", and "A2".}
+#' \item{combined_LD_matrix}{The LD matrix for each region, with row and column names matching variant identifiers.}
+#' \item{block_indices}{A data frame tracking the indices of variants in the combined matrix by LD block,
+#' with columns "block_id", "start_idx", "end_idx", "chrom", "block_start", "block_end" to facilitate
+#' further partitioning if needed.}
+#' }
+#' @export
+load_LD_matrix <- function(LD_meta_file_path, region, extract_coordinates = NULL) {
+  # Intersect LD metadata with specified regions using updated function
+  intersected_LD_files <- get_regional_ld_meta(LD_meta_file_path, region)
+
+  # Extract file paths for LD and bim files
+  LD_file_paths <- intersected_LD_files$intersections$LD_file_paths
+  bim_file_paths <- intersected_LD_files$intersections$bim_file_paths
+
+  # Using a for loop here to allow for rm() in each loop to save memory
+  extracted_LD_matrices_list <- list()
+  extracted_LD_variants_list <- list()
+  block_chroms <- character(length(LD_file_paths))
+
+  # Process each LD block individually
+  for (j in seq_along(LD_file_paths)) {
+    LD_matrix_processed <- process_LD_matrix(LD_file_paths[j], bim_file_paths[j])
+    extracted_LD_list <- extract_LD_for_region(
+      LD_matrix = LD_matrix_processed$LD_matrix,
+      variants = LD_matrix_processed$LD_variants,
+      region = intersected_LD_files$region,
+      extract_coordinates = extract_coordinates
+    )
+    extracted_LD_matrices_list[[j]] <- extracted_LD_list$extracted_LD_matrix
+    extracted_LD_variants_list[[j]] <- extracted_LD_list$extracted_LD_variants
+    if (nrow(extracted_LD_variants_list[[j]]) > 0) {
+      block_chroms[j] <- as.character(extracted_LD_variants_list[[j]]$chrom[1])
+    } else {
+      # Use region chromosome as fallback for empty blocks
+      block_chroms[j] <- as.character(intersected_LD_files$region$chrom)
+    }
+  }
+
+  # Filter out empty blocks before combining
+  non_empty <- sapply(extracted_LD_variants_list, function(v) nrow(v) > 0)
+  if (!any(non_empty)) {
+    stop("No variants found in any LD block for the specified region.")
+  }
+  if (any(!non_empty)) {
+    message(paste(
+      "Removing", sum(!non_empty), "empty LD block(s) with no variants in the region."
+    ))
+    extracted_LD_matrices_list <- extracted_LD_matrices_list[non_empty]
+    extracted_LD_variants_list <- extracted_LD_variants_list[non_empty]
+    block_chroms <- block_chroms[non_empty]
+    LD_file_paths <- LD_file_paths[non_empty]
+  }
+
+  # Create combined LD matrix
+  combined_LD_matrix <- create_combined_LD_matrix(
+    LD_matrices = extracted_LD_matrices_list,
+    variants = extracted_LD_variants_list
+  )
+  # Variants are already in canonical format (chr prefix) from process_LD_matrix / normalize_variant_id
+  combined_LD_variants <- rownames(combined_LD_matrix)
+
+  # Build block metadata — variants already in canonical format, no normalization needed
+  block_variants <- lapply(extracted_LD_variants_list, function(v) v$variants)
+  block_metadata <- data.frame(
+    block_id = seq_along(LD_file_paths),
+    chrom = block_chroms,
+    size = sapply(block_variants, length),
+    start_idx = sapply(block_variants, function(v) min(match(v, combined_LD_variants))),
+    end_idx = sapply(block_variants, function(v) max(match(v, combined_LD_variants))),
+    stringsAsFactors = FALSE
+  )
+
+  # Remove large objects to free memory
+  rm(extracted_LD_matrices_list)
+
+  # Create reference panel from canonical variant IDs
+  ref_panel <- parse_variant_id(rownames(combined_LD_matrix))
+  merged_variant_list <- do.call(rbind, extracted_LD_variants_list)
+  ref_panel$variant_id <- rownames(combined_LD_matrix)
+
+  if ("variance" %in% colnames(merged_variant_list)) {
+    ref_panel$variance <- merged_variant_list$variance[match(rownames(combined_LD_matrix), merged_variant_list$variants)]
+  }
+
+  # Return the combined LD list
+  combined_LD_list <- list(
+    combined_LD_variants = combined_LD_variants,
+    combined_LD_matrix = combined_LD_matrix,
+    ref_panel = ref_panel,
+    block_metadata = block_metadata
+  )
+
+  return(combined_LD_list)
+}
+
+#' Filter variants by LD Reference
+#'
+#' @param variant_ids variant names in the format chr:pos_ref_alt or chr:pos:ref:alt.
+#' @param ld_reference_meta_file A data frame similar to 'genomic_data' in get_regional_ld_meta function.
+#' @return A subset of variants, filtered based on LD reference data.
+#' @importFrom stringr str_split
+#' @importFrom dplyr select group_by summarise
+#' @importFrom vroom vroom
+#' @export
+filter_variants_by_ld_reference <- function(variant_ids, ld_reference_meta_file, keep_indel = TRUE) {
+  # Step 1: Process variant IDs into a data frame and filter out non-standard nucleotides
+  variants_df <- do.call(rbind, lapply(strsplit(variant_ids, ":"), function(x) {
+    data.frame(chrom = x[1], pos = as.integer(x[2]), ref = x[3], alt = x[4])
+  }))
+
+  variants_df$chrom <- ifelse(grepl("^chr", variants_df$chrom),
+    as.integer(sub("^chr", "", variants_df$chrom)), # Remove 'chr' and convert to integer
+    as.integer(variants_df$chrom)
+  )
+  # Step 2: Derive region information from the variants data frame
+  region_df <- variants_df %>%
+    group_by(chrom) %>%
+    summarise(start = min(pos), end = max(pos))
+  # Step 3: Call get_regional_ld_meta to get bim_file_paths
+  bim_file_paths <- get_regional_ld_meta(ld_reference_meta_file, region_df)$intersections$bim_file_paths
+
+  # Step 4: Load bim files and consolidate into a single data frame
+  bim_data <- lapply(bim_file_paths, function(path) {
+    bim_df <- vroom(path, col_names = FALSE)
+    data.frame(chrom = bim_df$X1, pos = bim_df$X4, stringsAsFactors = FALSE)
+  }) %>%
+    do.call("rbind", .)
+
+  # Step 5: Overlap the variants data frame with bim_data
+  keep_indices <- which(paste(variants_df$chrom, variants_df$pos) %in% paste(bim_data$chrom, bim_data$pos))
+  if (!keep_indel) {
+    valid_nucleotides <- c("A", "T", "C", "G")
+    snp_idx <- which((variants_df$ref %in% valid_nucleotides) & (variants_df$alt %in% valid_nucleotides))
+    keep_indices <- intersect(keep_indices, snp_idx)
+  }
+  variants_filtered <- variant_ids[keep_indices]
+
+  message(length(variant_ids) - length(keep_indices), " out of ", length(variant_ids), " total variants dropped due to absence on the reference LD panel.")
+
+  return(list(data = variants_filtered, idx = keep_indices))
+}
+
+#' Partition LD Matrix into Block-Specific Matrices
+#'
+#' @param ld_data A list as returned by load_LD_matrix, containing combined_LD_matrix,
+#'                combined_LD_variants, ref_panel, and block_metadata.
+#' @param merge_small_blocks Logical, whether to merge blocks smaller than min_merged_block_size (default: TRUE).
+#' @param min_merged_block_size Integer, minimum number of variants for a block after merging (default: 500).
+#' @param max_merged_block_size Integer, maximum number of variants in a block after merging (default: 10000).
+#'
+#' @return returns a list containing:
+#' \describe{
+#' \item{ld_matrices}{A list of matrices, each representing LD for a specific block.}
+#' \item{variant_indices}{A data frame that maps variant IDs to their corresponding block.}
+#' \item{block_metadata}{Information about each block including size, chromosome, start and end positions.}
+#' }
+#' @noRd
+partition_LD_matrix <- function(ld_data, merge_small_blocks = TRUE,
+                                min_merged_block_size = 500, max_merged_block_size = 10000) {
+  # Extract components from ld_data
+  combined_matrix <- ld_data$combined_LD_matrix
+  block_metadata <- ld_data$block_metadata
+  variant_ids <- ld_data$combined_LD_variants
+
+  # Error if matrix is empty
+  if (is.null(combined_matrix) || nrow(combined_matrix) == 0 || ncol(combined_matrix) == 0) {
+    stop("Empty or NULL LD matrix provided.")
+  }
+
+  # Ensure the row and column names of the matrix match the variant_ids
+  if (is.null(rownames(combined_matrix)) || is.null(colnames(combined_matrix)) ||
+    !identical(rownames(combined_matrix), variant_ids) || !identical(colnames(combined_matrix), variant_ids)) {
+    rownames(combined_matrix) <- variant_ids
+    colnames(combined_matrix) <- variant_ids
+  }
+
+  # Filter out blocks with invalid indices (empty blocks, out-of-range, NA, Inf)
+  n_variants <- length(variant_ids)
+  valid_blocks <- sapply(seq_len(nrow(block_metadata)), function(i) {
+    s <- block_metadata$start_idx[i]
+    e <- block_metadata$end_idx[i]
+    sz <- block_metadata$size[i]
+    # Block is valid if: size > 0, indices are finite integers, and within range
+    !is.na(s) && !is.na(e) && is.finite(s) && is.finite(e) &&
+      sz > 0 && s >= 1 && e >= s && e <= n_variants
+  })
+
+  if (!any(valid_blocks)) {
+    stop("No valid LD blocks found. All block indices are out of range or empty.")
+  }
+
+  if (any(!valid_blocks)) {
+    message(paste(
+      "Removing", sum(!valid_blocks),
+      "LD block(s) with invalid or out-of-range indices."
+    ))
+    block_metadata <- block_metadata[valid_blocks, , drop = FALSE]
+    block_metadata$block_id <- seq_len(nrow(block_metadata))
+  }
+
+  # Validate the block structure of the matrix (skip if only one block)
+  if (nrow(block_metadata) > 1) {
+    validate_block_structure(combined_matrix, block_metadata, variant_ids)
+  }
+
+  # Optionally merge small blocks
+  if (merge_small_blocks && any(block_metadata$size < min_merged_block_size) && nrow(block_metadata) > 1) {
+    block_metadata <- merge_blocks(block_metadata, min_merged_block_size, max_merged_block_size)
+  }
+
+  # Partition the matrix based on block metadata
+  result <- extract_block_matrices(combined_matrix, block_metadata, variant_ids)
+  return(result)
+}
+
+# Helper function to validate block structure
+validate_block_structure <- function(matrix, block_metadata, variant_ids) {
+  validation_failed <- FALSE
+  validation_message <- character(0)
+
+  # Iterate over all pairs of distinct blocks
+  for (i in 1:(nrow(block_metadata) - 1)) {
+    for (j in (i + 1):nrow(block_metadata)) {
+      # Get indices for each block
+      block_i_start <- block_metadata$start_idx[i]
+      block_i_end <- block_metadata$end_idx[i]
+      block_j_start <- block_metadata$start_idx[j]
+      block_j_end <- block_metadata$end_idx[j]
+
+      # Check if indices are valid
+      if (block_i_start > length(variant_ids) || block_i_end > length(variant_ids) ||
+        block_j_start > length(variant_ids) || block_j_end > length(variant_ids)) {
+        validation_failed <- TRUE
+        validation_message <- c(
+          validation_message,
+          paste("Block indices are out of range for blocks", i, "and", j)
+        )
+        next
+      }
+
+      # Get variants for each block, EXCLUDING THE BOUNDARY VARIANTS
+      # For block i, exclude the last variant (potential overlap with next block)
+      # For block j, exclude the first variant (potential overlap with previous block)
+      block_i_vars <- variant_ids[block_i_start:(block_i_end - 1)]
+      block_j_vars <- variant_ids[(block_j_start + 1):block_j_end]
+
+      # Only check if there are variants in both blocks
+      if (length(block_i_vars) > 0 && length(block_j_vars) > 0) {
+        # Extract the cross-block submatrix for these variants
+        cross_block_unique <- matrix[block_i_vars, block_j_vars, drop = FALSE]
+
+        # Check if any values are non-zero
+        max_value <- max(abs(cross_block_unique))
+        if (max_value > 1e-10) {
+          validation_failed <- TRUE
+          validation_message <- c(
+            validation_message,
+            paste(
+              "Non-zero correlation detected between unique variants of blocks", i, "and", j,
+              "- Max value:", max_value
+            )
+          )
+        }
+      }
+    }
+  }
+
+  # If validation failed, stop execution with error message
+  if (validation_failed) {
+    stop(
+      "Matrix does not have the expected block structure:\n",
+      paste(validation_message, collapse = "\n")
+    )
+  }
+}
+# Helper function to merge small blocks
+# Check if two blocks can be merged based on chromosome and size constraints
+can_merge <- function(block1, block2, max_size) {
+  # Check if blocks are on the same chromosome
+  same_chrom <- block1$chrom == block2$chrom
+
+  # Check if combined size is within limits
+  combined_size <- block1$size + block2$size
+  size_ok <- combined_size <= max_size
+
+  return(same_chrom && size_ok)
+}
+
+# Merge two blocks in the metadata dataframe
+merge_two_blocks <- function(block_metadata, idx1, idx2) {
+  # Ensure idx1 < idx2 for consistent behavior
+  if (idx1 > idx2) {
+    temp <- idx1
+    idx1 <- idx2
+    idx2 <- temp
+  }
+
+  # Create a copy to avoid modifying the input directly
+  result <- block_metadata
+
+  # Update the end index and size of the first block
+  result$end_idx[idx1] <- block_metadata$end_idx[idx2]
+  result$size[idx1] <- block_metadata$size[idx1] + block_metadata$size[idx2]
+
+  # Remove the second block
+  result <- result[-idx2, ]
+
+  # Renumber block IDs
+  result$block_id <- seq_len(nrow(result))
+
+  return(result)
+}
+
+# Find small blocks and their best merge candidates
+find_merge_candidates <- function(block_metadata, min_size, max_size) {
+  candidates <- data.frame(
+    block_idx = integer(),
+    merge_with = integer(),
+    stringsAsFactors = FALSE
+  )
+
+  for (i in seq_len(nrow(block_metadata))) {
+    # Skip if block size is adequate
+    if (block_metadata$size[i] >= min_size) next
+
+    # Check previous block
+    prev_idx <- i - 1
+    can_merge_prev <- prev_idx >= 1 &&
+      can_merge(block_metadata[i, ], block_metadata[prev_idx, ], max_size)
+
+    # Check next block
+    next_idx <- i + 1
+    can_merge_next <- next_idx <= nrow(block_metadata) &&
+      can_merge(block_metadata[i, ], block_metadata[next_idx, ], max_size)
+
+    # Determine best merge option
+    if (can_merge_prev && can_merge_next) {
+      # Choose the smaller of the two to minimize impact
+      if (block_metadata$size[prev_idx] <= block_metadata$size[next_idx]) {
+        merge_with <- prev_idx
+      } else {
+        merge_with <- next_idx
+      }
+      candidates <- rbind(candidates, data.frame(block_idx = i, merge_with = merge_with))
+    } else if (can_merge_prev) {
+      candidates <- rbind(candidates, data.frame(block_idx = i, merge_with = prev_idx))
+    } else if (can_merge_next) {
+      candidates <- rbind(candidates, data.frame(block_idx = i, merge_with = next_idx))
+    }
+    # If no valid merge candidates, leave the block as is
+  }
+
+  return(candidates)
+}
+
+# Helper function to merge small blocks
+merge_blocks <- function(block_metadata, min_size, max_size) {
+  # If there's only one block or empty input, just return it
+  if (nrow(block_metadata) <= 1) {
+    return(block_metadata)
+  }
+
+  new_block_metadata <- block_metadata
+  made_changes <- TRUE
+
+  # Iterate until no more merges are possible
+  while (made_changes) {
+    # Find all current merge candidates
+    candidates <- find_merge_candidates(new_block_metadata, min_size, max_size)
+
+    # Stop if no candidates found
+    if (nrow(candidates) == 0) {
+      made_changes <- FALSE
+      break
+    }
+
+    # Process the first candidate (we only do one merge per iteration to avoid index issues)
+    new_block_metadata <- merge_two_blocks(
+      new_block_metadata,
+      candidates$block_idx[1],
+      candidates$merge_with[1]
+    )
+  }
+
+  return(new_block_metadata)
+}
+
+# Helper function to extract block matrices
+extract_block_matrices <- function(matrix, block_metadata, variant_ids) {
+  ld_matrices <- list()
+  variant_mapping <- data.frame(
+    variant_id = character(),
+    block_id = integer(),
+    stringsAsFactors = FALSE
+  )
+
+  for (i in seq_len(nrow(block_metadata))) {
+    start_idx <- block_metadata$start_idx[i]
+    end_idx <- block_metadata$end_idx[i]
+
+    # Skip empty blocks
+    if (end_idx < start_idx) next
+
+    # Ensure indices are within bounds
+    if (start_idx > length(variant_ids) || end_idx > length(variant_ids)) {
+      warning(paste("Block", i, "has indices outside the range of variant_ids. Skipping."))
+      next
+    }
+
+    # Extract variant IDs for this block
+    block_variants <- variant_ids[start_idx:end_idx]
+
+    # Extract submatrix for this block - use named indexing
+    block_matrix <- matrix[block_variants, block_variants, drop = FALSE]
+
+    # Store in list
+    ld_matrices[[i]] <- block_matrix
+
+    # Update variant mapping
+    block_mapping <- data.frame(
+      variant_id = block_variants,
+      block_id = i,
+      stringsAsFactors = FALSE
+    )
+    variant_mapping <- rbind(variant_mapping, block_mapping)
+
+  }
+
+  return(list(
+    ld_matrices = ld_matrices,
+    variant_indices = variant_mapping,
+    block_metadata = block_metadata
+  ))
+}

From 6d14d27d55df8281d63c2c03e930c44f0c938424 Mon Sep 17 00:00:00 2001
From: xueweic <xueweic@users.noreply.github.com>
Date: Sat, 28 Mar 2026 12:53:41 +0000
Subject: [PATCH 3/3] Update documentation

---
 man/extract_LD_for_region.Rd           | 11 ++++++
 man/filter_variants_by_ld_reference.Rd | 13 +++-----
 man/load_LD_matrix.Rd                  | 46 +++++++++-----------------
 3 files changed, 30 insertions(+), 40 deletions(-)
 create mode 100644 man/extract_LD_for_region.Rd

diff --git a/man/extract_LD_for_region.Rd b/man/extract_LD_for_region.Rd
new file mode 100644
index 00000000..a7e30f84
--- /dev/null
+++ b/man/extract_LD_for_region.Rd
@@ -0,0 +1,11 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/LD.R
+\name{extract_LD_for_region}
+\alias{extract_LD_for_region}
+\title{Extract LD matrix and variants for a specific region}
+\usage{
+extract_LD_for_region(LD_matrix, variants, region, extract_coordinates)
+}
+\description{
+Extract LD matrix and variants for a specific region
+}
diff --git a/man/filter_variants_by_ld_reference.Rd b/man/filter_variants_by_ld_reference.Rd
index 1454f645..15ec7757 100644
--- a/man/filter_variants_by_ld_reference.Rd
+++ b/man/filter_variants_by_ld_reference.Rd
@@ -11,18 +11,13 @@ filter_variants_by_ld_reference(
 )
 }
 \arguments{
-\item{variant_ids}{variant names in the format chr:pos:ref:alt.}
+\item{variant_ids}{variant names in the format chr:pos_ref_alt or chr:pos:ref:alt.}
 
-\item{ld_reference_meta_file}{Path to LD metadata file or PLINK prefix.}
-
-\item{keep_indel}{Whether to keep indel variants. Default TRUE.}
+\item{ld_reference_meta_file}{A data frame similar to 'genomic_data' in get_regional_ld_meta function.}
 }
 \value{
-A list with:
-  \item{data}{Character vector of filtered variant IDs.}
-  \item{idx}{Integer vector of indices into the original variant_ids.}
+A subset of variants, filtered based on LD reference data.
 }
 \description{
-Filters a vector of variant IDs to those present in the LD reference panel.
-Auto-detects the reference type (PLINK2, PLINK1, or pre-computed LD metadata).
+Filter variants by LD Reference
 }
diff --git a/man/load_LD_matrix.Rd b/man/load_LD_matrix.Rd
index cc3903f2..ec7d5d82 100644
--- a/man/load_LD_matrix.Rd
+++ b/man/load_LD_matrix.Rd
@@ -4,45 +4,29 @@
 \alias{load_LD_matrix}
 \title{Load and Process Linkage Disequilibrium (LD) Matrix}
 \usage{
-load_LD_matrix(
-  LD_meta_file_path,
-  region,
-  extract_coordinates = NULL,
-  return_genotype = FALSE,
-  n_sample = NULL
-)
+load_LD_matrix(LD_meta_file_path, region, extract_coordinates = NULL)
 }
 \arguments{
-\item{LD_meta_file_path}{Path to LD metadata file, or prefix for PLINK genotype files.}
+\item{LD_meta_file_path}{path of LD_metadata, LD_metadata is a data frame specifying LD blocks with
+columns "chrom", "start", "end", and "path". "start" and "end" denote the positions of LD blocks.
+"path" is the path of each LD block, optionally including bim file paths.}
 
-\item{region}{Region of interest: "chr:start-end" string or data.frame with chrom/start/end.}
+\item{region}{A data frame specifying region of interest with columns "chrom", "start", and "end".}
 
-\item{extract_coordinates}{Optional data.frame with columns "chrom" and "pos" for
-specific coordinates extraction (only for pre-computed LD blocks).}
-
-\item{return_genotype}{If TRUE, return the genotype matrix X instead of the LD
-correlation matrix R. Only valid for PLINK genotype sources.}
-
-\item{n_sample}{Optional sample size for computing variance (= 2*p*(1-p)*n/(n-1)).
-If NULL, ref_panel will not include variance or n_nomiss columns.
-Only used for PLINK genotype sources.}
+\item{extract_coordinates}{Optional data frame with columns "chrom" and "pos" for specific coordinates extraction.}
 }
 \value{
-A list with:
+A list of processed LD matrices and associated variant data frames for region of interest.
+Each element of the list contains:
 \describe{
-  \item{LD_variants}{Character vector of variant IDs (canonical format).}
-  \item{LD_matrix}{Symmetric LD correlation matrix (or genotype matrix if return_genotype=TRUE).}
-  \item{ref_panel}{Data.frame with variant metadata (chrom, pos, A2, A1, variant_id,
-    and optionally allele_freq, variance, n_nomiss).}
-  \item{block_metadata}{Data.frame with block info (block_id, chrom, block_start, block_end, size, start_idx, end_idx).}
+\item{combined_LD_variants}{A data frame merging selected variants within each LD block in bim file format with columns "chrom", "variants",
+"GD", "pos", "A1", and "A2".}
+\item{combined_LD_matrix}{The LD matrix for each region, with row and column names matching variant identifiers.}
+\item{block_indices}{A data frame tracking the indices of variants in the combined matrix by LD block,
+with columns "block_id", "start_idx", "end_idx", "chrom", "block_start", "block_end" to facilitate
+further partitioning if needed.}
 }
 }
 \description{
-Unified entry point for loading LD data. Auto-detects the source type:
-\itemize{
-  \item Pre-computed LD blocks (metadata file with chrom/start/end/path columns)
-  \item PLINK2 genotype files (.pgen/.pvar[.zst]/.psam + .afreq)
-  \item PLINK1 genotype files (.bed/.bim/.fam)
-}
-For PLINK genotype sources, LD is computed on the fly via \code{compute_LD()}.
+Load and Process Linkage Disequilibrium (LD) Matrix
 }