diff --git a/.gitignore b/.gitignore index 4f1fdc64..9d0335a5 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,6 @@ inst/doc *.log *.o *.so -*.dll \ No newline at end of file +*.dll +.lintr +.vscode \ No newline at end of file diff --git a/R/MSstatsConvert_core_functions.R b/R/MSstatsConvert_core_functions.R index 8f06973c..c5e98c77 100644 --- a/R/MSstatsConvert_core_functions.R +++ b/R/MSstatsConvert_core_functions.R @@ -518,6 +518,12 @@ MSstatsMakeAnnotation = function(input, annotation, ...) { } #' Run Anomaly Model +#' +#' Detects anomalous measurements in mass spectrometry data using an isolation forest algorithm. +#' This function identifies unusual precursor measurements based on quality metrics and their +#' temporal patterns. For features with insufficient quality metric data, it assigns anomaly +#' scores based on the median score of similar features (same peptide and charge combination). +#' The model supports parallel processing for improved performance on large datasets. #' #' @param input data.table preprocessed by the MSstatsBalancedDesign function #' @param quality_metrics character vector of quality metrics to use in the model @@ -535,7 +541,7 @@ MSstatsMakeAnnotation = function(input, annotation, ...) 
{ MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction, missing_run_count, n_feat, run_order, n_trees, max_depth, cores){ - + input = .prepareSpectronautAnomalyInput(input, quality_metrics, run_order, n_feat, missing_run_count) @@ -548,14 +554,14 @@ MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction, temporal_direction[i])) } } - + input = .runAnomalyModel(input, n_trees=n_trees, max_depth=max_depth, cores=cores, split_column="PSM", quality_metrics=quality_metrics) - + subset_cols = c("Run", "ProteinName", "PeptideSequence", "PrecursorCharge", "FragmentIon", "ProductCharge", "IsotopeLabelType", diff --git a/R/clean_DIANN.R b/R/clean_DIANN.R index afe75554..23f49c8b 100644 --- a/R/clean_DIANN.R +++ b/R/clean_DIANN.R @@ -8,7 +8,9 @@ quantificationColumn = "FragmentQuantCorrected", global_qvalue_cutoff = 0.01, qvalue_cutoff = 0.01, - pg_qvalue_cutoff = 0.01) { + pg_qvalue_cutoff = 0.01, + calculateAnomalyScores = FALSE, + anomalyModelFeatures = c()) { dn_input <- getInputFile(msstats_object, "input") dn_input <- data.table::as.data.table(dn_input) @@ -19,7 +21,9 @@ dn_input <- .cleanDIANNAddMissingColumns(dn_input) # Select required columns - dn_input <- .cleanDIANNSelectRequiredColumns(dn_input, quantificationColumn, MBR) + dn_input <- .cleanDIANNSelectRequiredColumns(dn_input, quantificationColumn, MBR, + calculateAnomalyScores, + anomalyModelFeatures) # Split concatenated values dn_input <- .cleanDIANNSplitConcatenatedValues(dn_input, quantificationColumn) @@ -78,7 +82,9 @@ #' @param MBR logical indicating if match between runs was used #' @return data.table with selected columns #' @noRd -.cleanDIANNSelectRequiredColumns <- function(dn_input, quantificationColumn, MBR) { +.cleanDIANNSelectRequiredColumns <- function(dn_input, quantificationColumn, MBR, + calculateAnomalyScores, + anomalyModelFeatures) { base_cols <- c('ProteinNames', 'StrippedSequence', 'ModifiedSequence', 'PrecursorCharge', quantificationColumn, 
'QValue', 'PrecursorMz', 'FragmentInfo', 'Run') @@ -89,7 +95,13 @@ c('GlobalQValue', 'GlobalPGQValue') } - req_cols <- c(base_cols, mbr_cols) + qual_cols <- if (calculateAnomalyScores) { + anomalyModelFeatures + } else { + c() + } + + req_cols <- c(base_cols, mbr_cols, qual_cols) return(dn_input[, req_cols, with = FALSE]) } @@ -163,13 +175,13 @@ getOption("MSstatsLog")("INFO", msg) getOption("MSstatsMsg")("INFO", msg) - dn_input = dn_input[QValue < global_qvalue_cutoff, ] + dn_input = dn_input[QValue >= global_qvalue_cutoff, (quantificationColumn) := 0] if (MBR) { msg = '** MBR was used to analyze the data. Now setting names and filtering' msg_1_mbr = paste0('-- LibPGQValue < ', pg_qvalue_cutoff) msg_2_mbr = paste0('-- LibQValue < ', qvalue_cutoff) - dn_input = dn_input[LibPGQValue < pg_qvalue_cutoff, ] - dn_input = dn_input[LibQValue < qvalue_cutoff, ] + dn_input = dn_input[LibPGQValue >= pg_qvalue_cutoff, (quantificationColumn) := 0] + dn_input = dn_input[LibQValue >= qvalue_cutoff, (quantificationColumn) := 0] getOption("MSstatsLog")("INFO", msg) getOption("MSstatsMsg")("INFO", msg) getOption("MSstatsLog")("INFO", msg_1_mbr) @@ -181,8 +193,8 @@ msg = '** MBR was not used to analyze the data.
Now setting names and filtering' msg_1 = paste0('-- Filtering on GlobalPGQValue < ', pg_qvalue_cutoff) msg_2 = paste0('-- Filtering on GlobalQValue < ', qvalue_cutoff) - dn_input = dn_input[GlobalPGQValue < pg_qvalue_cutoff, ] - dn_input = dn_input[GlobalQValue < qvalue_cutoff, ] + dn_input = dn_input[GlobalPGQValue >= pg_qvalue_cutoff, (quantificationColumn) := 0] + dn_input = dn_input[GlobalQValue >= qvalue_cutoff, (quantificationColumn) := 0] getOption("MSstatsLog")("INFO", msg) getOption("MSstatsMsg")("INFO", msg) getOption("MSstatsLog")("INFO", msg_1) diff --git a/R/converters_DIANNtoMSstatsFormat.R b/R/converters_DIANNtoMSstatsFormat.R index 1b46976b..833990f6 100644 --- a/R/converters_DIANNtoMSstatsFormat.R +++ b/R/converters_DIANNtoMSstatsFormat.R @@ -22,6 +22,15 @@ #' @param quantificationColumn Use 'FragmentQuantCorrected'(default) column for quantified intensities for DIANN 1.8.x. #' Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x. #' Use 'auto' for quantified intensities for DIANN 2.x where each fragment intensity is a separate column, e.g. Fr0Quantity. +#' @param calculateAnomalyScores Default is FALSE. If TRUE, will run anomaly detection model and calculate anomaly scores for each feature. Used downstream to weigh measurements in differential analysis. +#' @param anomalyModelFeatures character vector of quality metric column names to be used as features in the anomaly detection model. List must not be empty if calculateAnomalyScores=TRUE. +#' @param anomalyModelFeatureTemporal character vector of temporal direction corresponding to columns passed to anomalyModelFeatures. Values must be one of: `mean_decrease`, `mean_increase`, `dispersion_increase`, or NULL (to perform no temporal feature engineering). Default is empty vector. If calculateAnomalyScores=TRUE, vector must have as many values as anomalyModelFeatures (even if all NULL). +#' @param removeMissingFeatures Remove features with missing values in more than this fraction of runs.
Default is 0.5. Only used if calculateAnomalyScores=TRUE. +#' @param anomalyModelFeatureCount Feature selection for anomaly model. Anomaly detection works on the precursor-level and can be much slower if all features used. We will by default filter to the top-100 highest intensity features. This can be adjusted as necessary. To turn feature-selection off, set this value to a high number (e.g. 10000). Only used if calculateAnomalyScores=TRUE. +#' @param runOrder Temporal order of MS runs. Should be a two column data.table with columns `Run` and `Order`, where `Run` matches the run name output by DIA-NN and `Order` is an integer. Used to engineer the temporal features defined in anomalyModelFeatureTemporal. +#' @param n_trees Number of trees to use in isolation forest when calculateAnomalyScores=TRUE. Default is 100. +#' @param max_depth Max tree depth to use in isolation forest when calculateAnomalyScores=TRUE. Default is "auto" which calculates depth as log2(N) where N is the number of runs. Otherwise must be an integer. +#' @param numberOfCores Number of cores for parallel processing anomaly detection model. When > 1, a logfile named 'MSstats_anomaly_model_progress.log' is created to track progress. Only works for Linux & Mac OS. Default is 1. #' @param ... additional parameters to `data.table::fread`. #' #' @return data.frame in the MSstats required format. @@ -51,29 +60,39 @@ #' output_2_0 = DIANNtoMSstatsFormat(input_2_0, annotation = annot_2_0, MBR = FALSE, #' use_log_file = FALSE, quantificationColumn = 'auto') #' head(output_2_0) -DIANNtoMSstatsFormat = function(input, annotation = NULL, - global_qvalue_cutoff = 0.01, - qvalue_cutoff = 0.01, - pg_qvalue_cutoff = 0.01, - useUniquePeptide = TRUE, - removeFewMeasurements = TRUE, - removeOxidationMpeptides = TRUE, - removeProtein_with1Feature = TRUE, - use_log_file = TRUE, append = FALSE, - verbose = TRUE, log_file_path = NULL, - MBR = TRUE, - quantificationColumn = "FragmentQuantCorrected", - ...) 
{ +DIANNtoMSstatsFormat = function( + input, annotation = NULL, + global_qvalue_cutoff = 0.01, + qvalue_cutoff = 0.01, + pg_qvalue_cutoff = 0.01, + useUniquePeptide = TRUE, + removeFewMeasurements = TRUE, + removeOxidationMpeptides = TRUE, + removeProtein_with1Feature = TRUE, + MBR = TRUE, + quantificationColumn = "FragmentQuantCorrected", + calculateAnomalyScores=FALSE, anomalyModelFeatures=c(), + anomalyModelFeatureTemporal=c(), removeMissingFeatures=.5, + anomalyModelFeatureCount=100, + runOrder=NULL, n_trees=100, max_depth="auto", numberOfCores=1, + use_log_file = TRUE, append = FALSE, + verbose = TRUE, log_file_path = NULL, + ...) { MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose, log_file_path) + anomalyModelFeatures = .standardizeColnames(anomalyModelFeatures) + input = MSstatsConvert::MSstatsImport(list(input = input), "MSstats", "DIANN") + input = MSstatsConvert::MSstatsClean(input, MBR = MBR, quantificationColumn = quantificationColumn, global_qvalue_cutoff = global_qvalue_cutoff, qvalue_cutoff = qvalue_cutoff, - pg_qvalue_cutoff = pg_qvalue_cutoff) + pg_qvalue_cutoff = pg_qvalue_cutoff, + calculateAnomalyScores = calculateAnomalyScores, + anomalyModelFeatures = anomalyModelFeatures) annotation = MSstatsConvert::MSstatsMakeAnnotation(input, annotation) decoy_filter = list(col_name = "ProteinName", @@ -87,6 +106,7 @@ DIANNtoMSstatsFormat = function(input, annotation = NULL, feature_columns = c("PeptideSequence", "PrecursorCharge", "FragmentIon", "ProductCharge") + # browser() input = MSstatsConvert::MSstatsPreprocess( input, annotation, @@ -101,19 +121,28 @@ DIANNtoMSstatsFormat = function(input, annotation = NULL, remove_features_with_few_measurements = removeFewMeasurements, summarize_multiple_psms = max), columns_to_fill = list(Fraction = 1, - IsotopeLabelType = "Light")) + IsotopeLabelType = "Light"), + anomaly_metrics = anomalyModelFeatures) input[, Intensity := ifelse(Intensity == 0, NA, Intensity)] - + # browser() input = 
MSstatsConvert::MSstatsBalancedDesign(input, feature_columns, - fill_incomplete = FALSE, + fill_incomplete = TRUE, handle_fractions = FALSE, - remove_few = removeFewMeasurements + remove_few = removeFewMeasurements, + anomaly_metrics = anomalyModelFeatures ) - + # browser() + if (calculateAnomalyScores){ + input = MSstatsConvert::MSstatsAnomalyScores( + input, anomalyModelFeatures, anomalyModelFeatureTemporal, + removeMissingFeatures, anomalyModelFeatureCount, runOrder, n_trees, + max_depth, numberOfCores) + } + # browser() msg_final = paste("** Finished preprocessing. The dataset is ready", "to be processed by the dataProcess function.") getOption("MSstatsLog")("INFO", msg_final) getOption("MSstatsMsg")("INFO", msg_final) getOption("MSstatsLog")("INFO", "\n") input -} \ No newline at end of file +} diff --git a/R/utils_anomaly_score.R b/R/utils_anomaly_score.R index d604f0b3..d5981732 100644 --- a/R/utils_anomaly_score.R +++ b/R/utils_anomaly_score.R @@ -79,66 +79,116 @@ #' Calculate mean increase #' @noRd -.add_mean_increase = function(quality_vector){ - - mean_increase = numeric(length(quality_vector)) - mean_increase[1] = 0 +.add_mean_increase = function(quality_vector) { + + n = length(quality_vector) + mean_increase = rep(NA_real_, n) + d = 0.5 - - for(k in 2:length(quality_vector)) { - # 5 is reference (3 sigma) - if (mean_increase[k] > 5){ - mean_increase[k] = max(0,(quality_vector[k] - d), na.rm = TRUE) - } else { - mean_increase[k] = max(0, - (quality_vector[k] - d + mean_increase[k-1]), - na.rm = TRUE) # positive CuSum + h = 5 + + state = 0 + mean_increase[1] = if (is.na(quality_vector[1])) NA_real_ else 0 + + if (n >= 2) { + for (k in 2:n) { + + if (is.na(quality_vector[k])) { + mean_increase[k] = NA_real_ # output NA at missing + next # state unchanged (carry forward) + } + + if (state > h) { + state = max(0, quality_vector[k] - d) + } else { + state = max(0, quality_vector[k] - d + state) + } + + mean_increase[k] = state } } - return(mean_increase) + 
+ mean_increase } #' Calculate mean decrease #' @noRd -.add_mean_decrease = function(quality_vector){ - - mean_decrease = numeric(length(quality_vector)) - mean_decrease[1] = 0 +.add_mean_decrease = function(quality_vector) { + + n = length(quality_vector) + mean_decrease = rep(NA_real_, n) + d = -0.5 - - for(k in 2:length(quality_vector)) { - # 5 is reference (3 sigma) - if (mean_decrease[k]>5){ - mean_decrease[k] <- max(0,(d - quality_vector[k] + 0), na.rm = TRUE) - } else { - mean_decrease[k] <- max(0, - (d - quality_vector[k] + mean_decrease[k-1]), - na.rm = TRUE) # negative CuSum + h = 5 + + state = 0 + mean_decrease[1] = + if (is.na(quality_vector[1])) NA_real_ else 0 + + if (n >= 2) { + for (k in 2:n) { + + if (is.na(quality_vector[k])) { + mean_decrease[k] = NA_real_ # mark missing + next # carry state forward + } + + if (state > h) { + state = max(0, d - quality_vector[k]) + } else { + state = max(0, d - quality_vector[k] + state) + } + + mean_decrease[k] = state } } - return(mean_decrease) + + mean_decrease } #' Calculate dispersion increase #' @noRd -.add_dispersion_increase = function(quality_vector){ - dispersion_increase = numeric(length(quality_vector)) - v = numeric(length(quality_vector)) - v[1] = (sqrt(abs(quality_vector[1]))-0.822)/0.349 +.add_dispersion_increase = function(quality_vector) { + + n = length(quality_vector) + dispersion_increase = rep(NA_real_, n) + v = rep(NA_real_, n) + d = 0.5 - for(k in 2:length(quality_vector)) { - - v[k] = (sqrt(abs(quality_vector[k]))-0.822)/0.349 - - if (dispersion_increase[k] > 5){ - dispersion_increase[k] = max(0,(v[k] - d), - na.rm = TRUE) - } else { - dispersion_increase[k] = max(0, - (v[k] - d + dispersion_increase[k-1]), - na.rm = TRUE) # CuSum variance + h = 5 + + # initialize state + state = 0 + + if (!is.na(quality_vector[1])) { + v[1] = (sqrt(abs(quality_vector[1])) - 0.822) / 0.349 + dispersion_increase[1] = 0 + } else { + v[1] = NA_real_ + dispersion_increase[1] = NA_real_ + } + + if (n >= 2) { 
+ for (k in 2:n) { + + if (is.na(quality_vector[k])) { + v[k] = NA_real_ + dispersion_increase[k] = NA_real_ # mark missing + next # carry state forward + } + + v[k] = (sqrt(abs(quality_vector[k])) - 0.822) / 0.349 + + if (state > h) { + state = max(0, v[k] - d) + } else { + state = max(0, v[k] - d + state) + } + + dispersion_increase[k] = state } } + return(dispersion_increase) } @@ -172,6 +222,8 @@ cat(paste0("Number of PSMs to process: ", num_psm), sep = "\n", file = "MSstats_anomaly_model_progress.log") + model_input = unique(input_data[, c(split_column, quality_metrics), with = FALSE]) + model_results = parallel::parLapply( cl, seq_len(num_psm), function(i){ @@ -179,7 +231,7 @@ cat("Finished processing an additional 100 PSMs", sep = "\n", file = "MSstats_anomaly_model_progress.log", append = TRUE) } - single_psm = input_data[get(split_column) == psm_list[[i]], + single_psm = model_input[model_input[[split_column]] == psm_list[[i]], ..quality_metrics] if (max_depth == "auto"){ @@ -192,9 +244,9 @@ } ) - model_results = unlist(model_results) - # Clip anomaly scores to stop them from exploding - input_data$AnomalyScores = pmax(model_results, .001) - + model_input$AnomalyScores = unlist(model_results) + + input_data = merge(input_data, model_input, by = c(split_column, quality_metrics), all.x = TRUE) + return(input_data) } diff --git a/man/DIANNtoMSstatsFormat.Rd b/man/DIANNtoMSstatsFormat.Rd index 2ea6e068..b7566ee5 100644 --- a/man/DIANNtoMSstatsFormat.Rd +++ b/man/DIANNtoMSstatsFormat.Rd @@ -14,12 +14,21 @@ DIANNtoMSstatsFormat( removeFewMeasurements = TRUE, removeOxidationMpeptides = TRUE, removeProtein_with1Feature = TRUE, + MBR = TRUE, + quantificationColumn = "FragmentQuantCorrected", + calculateAnomalyScores = FALSE, + anomalyModelFeatures = c(), + anomalyModelFeatureTemporal = c(), + removeMissingFeatures = 0.5, + anomalyModelFeatureCount = 100, + runOrder = NULL, + n_trees = 100, + max_depth = "auto", + numberOfCores = 1, use_log_file = TRUE, append = 
FALSE, verbose = TRUE, log_file_path = NULL, - MBR = TRUE, - quantificationColumn = "FragmentQuantCorrected", ... ) } @@ -50,6 +59,30 @@ the library created after the first MBR pass. Default is 0.01.} \item{removeProtein_with1Feature}{should proteins with a single feature be removed} +\item{MBR}{True if analysis was done with match between runs} + +\item{quantificationColumn}{Use 'FragmentQuantCorrected'(default) column for quantified intensities for DIANN 1.8.x. +Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x. +Use 'auto' for quantified intensities for DIANN 2.x where each fragment intensity is a separate column, e.g. Fr0Quantity.} + +\item{calculateAnomalyScores}{Default is FALSE. If TRUE, will run anomaly detection model and calculate anomaly scores for each feature. Used downstream to weigh measurements in differential analysis.} + +\item{anomalyModelFeatures}{character vector of quality metric column names to be used as features in the anomaly detection model. List must not be empty if calculateAnomalyScores=TRUE.} + +\item{anomalyModelFeatureTemporal}{character vector of temporal direction corresponding to columns passed to anomalyModelFeatures. Values must be one of: \code{mean_decrease}, \code{mean_increase}, \code{dispersion_increase}, or NULL (to perform no temporal feature engineering). Default is empty vector. If calculateAnomalyScores=TRUE, vector must have as many values as anomalyModelFeatures (even if all NULL).} + +\item{removeMissingFeatures}{Remove features with missing values in more than this fraction of runs. Default is 0.5. Only used if calculateAnomalyScores=TRUE.} + +\item{anomalyModelFeatureCount}{Feature selection for anomaly model. Anomaly detection works on the precursor-level and can be much slower if all features used. We will by default filter to the top-100 highest intensity features. This can be adjusted as necessary. To turn feature-selection off, set this value to a high number (e.g. 10000). 
Only used if calculateAnomalyScores=TRUE.} + +\item{runOrder}{Temporal order of MS runs. Should be a two column data.table with columns \code{Run} and \code{Order}, where \code{Run} matches the run name output by DIA-NN and \code{Order} is an integer. Used to engineer the temporal features defined in anomalyModelFeatureTemporal.} + +\item{n_trees}{Number of trees to use in isolation forest when calculateAnomalyScores=TRUE. Default is 100.} + +\item{max_depth}{Max tree depth to use in isolation forest when calculateAnomalyScores=TRUE. Default is "auto" which calculates depth as log2(N) where N is the number of runs. Otherwise must be an integer.} + +\item{numberOfCores}{Number of cores for parallel processing anomaly detection model. When > 1, a logfile named 'MSstats_anomaly_model_progress.log' is created to track progress. Only works for Linux & Mac OS. Default is 1.} + \item{use_log_file}{logical. If TRUE, information about data processing will be saved to a file.} @@ -64,12 +97,6 @@ data processing will be saved. If not provided, such a file will be created automatically. If \code{append = TRUE}, has to be a valid path to a file.} -\item{MBR}{True if analysis was done with match between runs} - -\item{quantificationColumn}{Use 'FragmentQuantCorrected'(default) column for quantified intensities for DIANN 1.8.x. -Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x. -Use 'auto' for quantified intensities for DIANN 2.x where each fragment intensity is a separate column, e.g. Fr0Quantity.} - -\item{...}{additional parameters to \code{data.table::fread}.} } \value{ diff --git a/man/MSstatsAnomalyScores.Rd b/man/MSstatsAnomalyScores.Rd index bc2b18a3..b6a9d500 100644 --- a/man/MSstatsAnomalyScores.Rd +++ b/man/MSstatsAnomalyScores.Rd @@ -39,5 +39,9 @@ MSstatsAnomalyScores( data.table } \description{ -Run Anomaly Model +Detects anomalous measurements in mass spectrometry data using an isolation forest algorithm.
+This function identifies unusual precursor measurements based on quality metrics and their +temporal patterns. For features with insufficient quality metric data, it assigns anomaly +scores based on the median score of similar features (same peptide and charge combination). +The model supports parallel processing for improved performance on large datasets. } diff --git a/man/MSstatsClean.Rd b/man/MSstatsClean.Rd index c5338a47..8c11da0c 100644 --- a/man/MSstatsClean.Rd +++ b/man/MSstatsClean.Rd @@ -69,7 +69,9 @@ MSstatsClean(msstats_object, ...) quantificationColumn = "FragmentQuantCorrected", global_qvalue_cutoff = 0.01, qvalue_cutoff = 0.01, - pg_qvalue_cutoff = 0.01 + pg_qvalue_cutoff = 0.01, + calculateAnomalyScores = FALSE, + anomalyModelFeatures = c() ) \S4method{MSstatsClean}{MSstatsMetamorpheusFiles}(msstats_object, MBR = TRUE, qvalue_cutoff = 0.05) diff --git a/man/dot-cleanRawDIANN.Rd b/man/dot-cleanRawDIANN.Rd index 6a61b2cc..dbdd9840 100644 --- a/man/dot-cleanRawDIANN.Rd +++ b/man/dot-cleanRawDIANN.Rd @@ -10,7 +10,9 @@ quantificationColumn = "FragmentQuantCorrected", global_qvalue_cutoff = 0.01, qvalue_cutoff = 0.01, - pg_qvalue_cutoff = 0.01 + pg_qvalue_cutoff = 0.01, + calculateAnomalyScores = FALSE, + anomalyModelFeatures = c() ) } \arguments{ diff --git a/src/isolation_forest.cpp b/src/isolation_forest.cpp index f1abac22..7ac6dae9 100644 --- a/src/isolation_forest.cpp +++ b/src/isolation_forest.cpp @@ -201,6 +201,269 @@ DataFrame calculate_anomaly_score( ); } +// #include +// #include +// #include +// #include +// #include +// #include +// #include + +// using namespace Rcpp; + +// const double EM_CONSTANT = 0.5772156649; // Euler-Mascheroni constant +// std::mt19937 gen(42); // Fixed random seed for reproducibility + +// struct IsolationTreeNode { +// bool is_leaf; +// int size; +// std::string feature; +// double value; +// bool is_missing_split; +// std::unique_ptr left; +// std::unique_ptr right; + +// IsolationTreeNode(int s) +// : 
is_leaf(true), +// size(s), +// feature(""), +// value(0.0), +// is_missing_split(false), +// left(nullptr), +// right(nullptr) {} + +// IsolationTreeNode( +// std::string feat, +// double val, +// bool missing_split, +// std::unique_ptr l, +// std::unique_ptr r +// ) +// : is_leaf(false), +// size(0), +// feature(std::move(feat)), +// value(val), +// is_missing_split(missing_split), +// left(std::move(l)), +// right(std::move(r)) {} +// }; + +// std::vector> +// convert_dataframe(const DataFrame& df) { +// std::vector> data; + +// int n_rows = df.nrows(); +// CharacterVector col_names = df.names(); +// int n_cols = col_names.size(); + +// data.reserve(n_rows); + +// for (int i = 0; i < n_rows; ++i) { +// std::unordered_map row; +// row.reserve(n_cols); + +// for (int j = 0; j < n_cols; ++j) { +// NumericVector col = df[j]; +// row[as(col_names[j])] = col[i]; +// } +// data.push_back(std::move(row)); +// } + +// return data; +// } + +// std::unique_ptr +// isolation_tree( +// const std::vector>& data, +// int depth = 0, +// int max_depth = -1 +// ) { +// int n = static_cast(data.size()); + +// if (max_depth == -1) { +// max_depth = static_cast(std::log2(n)); +// } + +// if (n <= 1 || depth >= max_depth) { +// return std::make_unique(n); +// } + +// std::vector features; +// features.reserve(data[0].size()); + +// for (const auto& kv : data[0]) { +// features.push_back(kv.first); +// } + +// std::uniform_int_distribution<> feat_dist(0, features.size() - 1); +// std::string split_feature = features[feat_dist(gen)]; + +// double min_val = data[0].at(split_feature); +// double max_val = min_val; +// int missing_count = 0; + +// for (const auto& row : data) { +// double val = row.at(split_feature); + +// if (std::isnan(val)) { +// missing_count++; +// continue; +// } + +// min_val = std::min(min_val, val); +// max_val = std::max(max_val, val); +// } + +// bool has_missing = (missing_count > 0); + +// double missing_fraction = +// static_cast(missing_count) / 
static_cast(n); + +// if (min_val == max_val && !has_missing) { +// return std::make_unique(n); +// } + +// // Calibrated missing split probability: +// // - rare missing -> rare split (more anomalous) +// // - common missing -> more missing splits (less anomalous) +// double p_miss = std::min(0.5, missing_fraction); + +// bool is_missing_split = false; +// double split_value = 0.0; + +// if (has_missing && +// (std::bernoulli_distribution(p_miss)(gen) || min_val == max_val)) { +// is_missing_split = true; +// } else { +// std::uniform_real_distribution<> split_dist(min_val, max_val); +// split_value = split_dist(gen); +// } + +// std::vector> left_data; +// std::vector> right_data; +// left_data.reserve(n); +// right_data.reserve(n); + +// for (const auto& row : data) { +// double v = row.at(split_feature); + +// if (is_missing_split) { +// if (std::isnan(v)) { +// left_data.push_back(row); +// } else { +// right_data.push_back(row); +// } +// } else { +// if (v < split_value) { +// left_data.push_back(row); +// } else { +// right_data.push_back(row); +// } +// } +// } + +// return std::make_unique( +// split_feature, +// split_value, +// is_missing_split, +// isolation_tree(left_data, depth + 1, max_depth), +// isolation_tree(right_data, depth + 1, max_depth) +// ); +// } + +// std::vector> +// isolation_forest( +// const std::vector>& data, +// int n_trees, +// int max_depth +// ) { +// std::vector> forest; +// forest.reserve(n_trees); + +// for (int i = 0; i < n_trees; ++i) { +// forest.push_back(isolation_tree(data, 0, max_depth)); +// } + +// return forest; +// } + +// // IMPORTANT FIX: respect is_missing_split during traversal +// int path_length( +// const IsolationTreeNode* node, +// const std::unordered_map& obs, +// int depth = 0 +// ) { +// if (node->is_leaf) { +// return depth + node->size; +// } + +// auto it = obs.find(node->feature); +// if (it == obs.end()) { +// // Feature absent from obs map. Fall back: stop at current depth. 
+// return depth; +// } + +// double x = it->second; + +// if (node->is_missing_split) { +// if (std::isnan(x)) { +// return path_length(node->left.get(), obs, depth + 1); +// } +// return path_length(node->right.get(), obs, depth + 1); +// } + +// if (std::isnan(x)) { +// // Numeric split but x missing: choose a side deterministically. +// // Here: treat missing as going left. +// return path_length(node->left.get(), obs, depth + 1); +// } + +// if (x < node->value) { +// return path_length(node->left.get(), obs, depth + 1); +// } + +// return path_length(node->right.get(), obs, depth + 1); +// } + +// // [[Rcpp::export]] +// DataFrame calculate_anomaly_score(DataFrame df, int n_trees, int max_depth) { +// std::vector> data = +// convert_dataframe(df); + +// int n = static_cast(data.size()); + +// auto forest = isolation_forest(data, n_trees, max_depth); + +// std::vector avg_path_length(n, 0.0); + +// for (int i = 0; i < n; ++i) { +// double total_path_length = 0.0; + +// for (const auto& tree : forest) { +// total_path_length += path_length(tree.get(), data[i]); +// } + +// avg_path_length[i] = total_path_length / static_cast(n_trees); +// } + +// double c_n = 2.0 * (std::log(n - 1.0) + EM_CONSTANT) - +// (2.0 * (n - 1.0) / n); + +// std::vector scores(n, 0.0); + +// for (int i = 0; i < n; ++i) { +// scores[i] = std::pow(2.0, -(avg_path_length[i] / c_n)); +// } + +// return DataFrame::create( +// _["avg_depth"] = avg_path_length, +// _["anomaly_score"] = scores +// ); +// } + + +// TODO: Parallelization would be faster within RcppParallel framework +// but requires setup and testing. // Parallel task struct to process each data frame // struct ParallelTask : public Worker { // // Input list of data frames