From 4f6941d77a500e5c8ee8dfdbc7355099f920c0a4 Mon Sep 17 00:00:00 2001 From: devonjkohler Date: Tue, 13 Jan 2026 10:05:39 -0500 Subject: [PATCH 01/11] DIANN MSstatsplus converter --- .gitignore | 4 +- R/clean_DIANN.R | 20 +++++-- R/converters_DIANNtoMSstatsFormat.R | 82 ++++++++++++++++++++--------- 3 files changed, 75 insertions(+), 31 deletions(-) diff --git a/.gitignore b/.gitignore index 4f1fdc64..9d0335a5 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,6 @@ inst/doc *.log *.o *.so -*.dll \ No newline at end of file +*.dll +.lintr +.vscode \ No newline at end of file diff --git a/R/clean_DIANN.R b/R/clean_DIANN.R index 2c8a46b7..83f24aff 100644 --- a/R/clean_DIANN.R +++ b/R/clean_DIANN.R @@ -8,7 +8,9 @@ #' @importFrom stats na.omit #' @keywords internal .cleanRawDIANN <- function(msstats_object, MBR = TRUE, - quantificationColumn = "FragmentQuantCorrected") { + quantificationColumn = "FragmentQuantCorrected", + calculateAnomalyScores = FALSE, + anomalyModelFeatures = c()) { dn_input <- getInputFile(msstats_object, "input") dn_input <- data.table::as.data.table(dn_input) @@ -19,7 +21,9 @@ dn_input <- .cleanDIANNAddMissingColumns(dn_input) # Select required columns - dn_input <- .cleanDIANNSelectRequiredColumns(dn_input, quantificationColumn, MBR) + dn_input <- .cleanDIANNSelectRequiredColumns(dn_input, quantificationColumn, MBR, + calculateAnomalyScores, + anomalyModelFeatures) # Split concatenated values dn_input <- .cleanDIANNSplitConcatenatedValues(dn_input, quantificationColumn) @@ -75,7 +79,9 @@ #' @param MBR logical indicating if match between runs was used #' @return data.table with selected columns #' @noRd -.cleanDIANNSelectRequiredColumns <- function(dn_input, quantificationColumn, MBR) { +.cleanDIANNSelectRequiredColumns <- function(dn_input, quantificationColumn, MBR, + calculateAnomalyScores, + anomalyModelFeatures) { base_cols <- c('ProteinNames', 'StrippedSequence', 'ModifiedSequence', 'PrecursorCharge', quantificationColumn, 'QValue', 'PrecursorMz', 'FragmentInfo', 'Run') @@ -86,7 +92,13 @@ c('GlobalQValue', 'GlobalPGQValue') } - req_cols <- c(base_cols, mbr_cols) + qual_cols <- if (calculateAnomalyScores) { + anomalyModelFeatures + } else { + c() + } + + req_cols <- c(base_cols, mbr_cols, qual_cols) return(dn_input[, req_cols, with = FALSE]) } diff --git a/R/converters_DIANNtoMSstatsFormat.R b/R/converters_DIANNtoMSstatsFormat.R index 22d6223a..806ce95b 100644 --- a/R/converters_DIANNtoMSstatsFormat.R +++ b/R/converters_DIANNtoMSstatsFormat.R @@ -23,6 +23,15 @@ #' @param quantificationColumn Use 'FragmentQuantCorrected'(default) column for quantified intensities for DIANN 1.8.x. #' Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x. #' Use 'auto' for quantified intensities for DIANN 2.x where each fragment intensity is a separate column, e.g. Fr0Quantity. +#' @param calculateAnomalyScores Default is FALSE. If TRUE, will run anomaly detection model and calculate anomaly scores for each feature. Used downstream to weigh measurements in differential analysis. +#' @param anomalyModelFeatures character vector of quality metric column names to be used as features in the anomaly detection model. List must not be empty if calculateAnomalyScores=TRUE. +#' @param anomalyModelFeatureTemporal character vector of temporal direction corresponding to columns passed to anomalyModelFeatures. Values must be one of: `mean_decrease`, `mean_increase`, `dispersion_increase`, or NULL (to perform no temporal feature engineering). Default is empty vector. If calculateAnomalyScores=TRUE, vector must have as many values as anomalyModelFeatures (even if all NULL). +#' @param removeMissingFeatures Remove features with missing values in more than this fraction of runs. Default is 0.5. Only used if calculateAnomalyScores=TRUE. +#' @param anomalyModelFeatureCount Feature selection for anomaly model. Anomaly detection works on the precursor-level and can be much slower if all features used. We will by default filter to the top-100 highest intensity features. This can be adjusted as necessary. To turn feature-selection off, set this value to a high number (e.g. 10000). Only used if calculateAnomalyScores=TRUE. +#' @param runOrder Temporal order of MS runs. Should be a two column data.table with columns `Run` and `Order`, where `Run` matches the run name output by Spectronaut and `Order` is an integer. Used to engineer the temporal features defined in anomalyModelFeatureTemporal. +#' @param n_trees Number of trees to use in isolation forest when calculateAnomalyScores=TRUE. Default is 100. +#' @param max_depth Max tree depth to use in isolation forest when calculateAnomalyScores=TRUE. Default is "auto" which calculates depth as log2(N) where N is the number of runs. Otherwise must be an integer. +#' @param numberOfCores Number of cores for parallel processing anomaly detection model. When > 1, a logfile named 'MSstats_anomaly_model_progress.log' is created to track progress. Only works for Linux & Mac OS. Default is 1. #' @param ... additional parameters to `data.table::fread`. #' #' @return data.frame in the MSstats required format. @@ -52,26 +61,36 @@ #' output_2_0 = DIANNtoMSstatsFormat(input_2_0, annotation = annot_2_0, MBR = FALSE, #' use_log_file = FALSE, quantificationColumn = 'auto') #' head(output_2_0) -DIANNtoMSstatsFormat = function(input, annotation = NULL, - global_qvalue_cutoff = 0.01, - qvalue_cutoff = 0.01, - pg_qvalue_cutoff = 0.01, - useUniquePeptide = TRUE, - removeFewMeasurements = TRUE, - removeOxidationMpeptides = TRUE, - removeProtein_with1Feature = TRUE, - use_log_file = TRUE, append = FALSE, - verbose = TRUE, log_file_path = NULL, - MBR = TRUE, - quantificationColumn = "FragmentQuantCorrected", - ...) { +DIANNtoMSstatsFormat = function( + input, annotation = NULL, + global_qvalue_cutoff = 0.01, + qvalue_cutoff = 0.01, + pg_qvalue_cutoff = 0.01, + useUniquePeptide = TRUE, + removeFewMeasurements = TRUE, + removeOxidationMpeptides = TRUE, + removeProtein_with1Feature = TRUE, + MBR = TRUE, + quantificationColumn = "FragmentQuantCorrected", + calculateAnomalyScores=FALSE, anomalyModelFeatures=c(), + anomalyModelFeatureTemporal=c(), removeMissingFeatures=.5, + anomalyModelFeatureCount=100, + runOrder=NULL, n_trees=100, max_depth="auto", numberOfCores=1, + use_log_file = TRUE, append = FALSE, + verbose = TRUE, log_file_path = NULL, + ...) { MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose, log_file_path) + anomalyModelFeatures = .standardizeColnames(anomalyModelFeatures) + input = MSstatsConvert::MSstatsImport(list(input = input), "MSstats", "DIANN") - input = MSstatsConvert::MSstatsClean(input, MBR = MBR, - quantificationColumn = quantificationColumn) + # browser() + input = MSstatsConvert::MSstatsClean( + input, MBR = MBR, quantificationColumn = quantificationColumn, + calculateAnomalyScores = calculateAnomalyScores, + anomalyModelFeatures = anomalyModelFeatures) annotation = MSstatsConvert::MSstatsMakeAnnotation(input, annotation) decoy_filter = list(col_name = "ProteinName", @@ -86,14 +105,15 @@ DIANNtoMSstatsFormat = function(input, annotation = NULL, msg = paste0('** Filtering on Global Q Value < ', global_qvalue_cutoff) getOption("MSstatsLog")("INFO", msg) getOption("MSstatsMsg")("INFO", msg) - - input = input[DetectionQValue < global_qvalue_cutoff, ] + # browser() + input[DetectionQValue >= global_qvalue_cutoff, Intensity := 0] + if (MBR) { msg = '** MBR was used to analyze the data. Now setting names and filtering' msg_1_mbr = paste0('-- LibPGQValue < ', pg_qvalue_cutoff) msg_2_mbr = paste0('-- LibQValue < ', qvalue_cutoff) - input = input[LibPGQValue < pg_qvalue_cutoff, ] - input = input[LibQValue < qvalue_cutoff, ] + input = input[LibPGQValue >= pg_qvalue_cutoff, Intensity := 0] + input = input[LibQValue >= qvalue_cutoff, Intensity := 0] getOption("MSstatsLog")("INFO", msg) getOption("MSstatsMsg")("INFO", msg) getOption("MSstatsLog")("INFO", msg_1_mbr) @@ -105,8 +125,8 @@ DIANNtoMSstatsFormat = function(input, annotation = NULL, msg = '** MBR was not used to analyze the data. Now setting names and filtering' msg_1 = paste0('-- Filtering on GlobalPGQValue < ', pg_qvalue_cutoff) msg_2 = paste0('-- Filtering on GlobalQValue < ', qvalue_cutoff) - input = input[GlobalPGQValue < pg_qvalue_cutoff, ] - input = input[GlobalQValue < qvalue_cutoff, ] + input = input[GlobalPGQValue >= pg_qvalue_cutoff, Intensity := 0] + input = input[GlobalQValue >= qvalue_cutoff, Intensity := 0] getOption("MSstatsLog")("INFO", msg) getOption("MSstatsMsg")("INFO", msg) getOption("MSstatsLog")("INFO", msg_1) @@ -118,6 +138,7 @@ DIANNtoMSstatsFormat = function(input, annotation = NULL, feature_columns = c("PeptideSequence", "PrecursorCharge", "FragmentIon", "ProductCharge") + # browser() input = MSstatsConvert::MSstatsPreprocess( input, annotation, @@ -132,15 +153,24 @@ DIANNtoMSstatsFormat = function(input, annotation = NULL, remove_features_with_few_measurements = removeFewMeasurements, summarize_multiple_psms = max), columns_to_fill = list(Fraction = 1, - IsotopeLabelType = "Light")) + IsotopeLabelType = "Light"), + anomaly_metrics = anomalyModelFeatures) input[, Intensity := ifelse(Intensity == 0, NA, Intensity)] - + # browser() input = MSstatsConvert::MSstatsBalancedDesign(input, feature_columns, - fill_incomplete = FALSE, + fill_incomplete = TRUE, handle_fractions = FALSE, - remove_few = removeFewMeasurements + remove_few = removeFewMeasurements, + anomaly_metrics = anomalyModelFeatures ) - + # browser() + if (calculateAnomalyScores){ + input = MSstatsConvert::MSstatsAnomalyScores( + input, anomalyModelFeatures, anomalyModelFeatureTemporal, + removeMissingFeatures, anomalyModelFeatureCount, runOrder, n_trees, + max_depth, numberOfCores) + } + # browser() msg_final = paste("** Finished preprocessing. The dataset is ready", "to be processed by the dataProcess function.") getOption("MSstatsLog")("INFO", msg_final) From 46bfc89a625cdbcb2617d24da2eaf7d10bccf5d3 Mon Sep 17 00:00:00 2001 From: devonjkohler Date: Tue, 13 Jan 2026 10:05:54 -0500 Subject: [PATCH 02/11] minor bug fixes to anomaly detection model --- R/utils_anomaly_score.R | 192 ++++++++++++++++++++++++++-------- src/isolation_forest.cpp | 215 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 364 insertions(+), 43 deletions(-) diff --git a/R/utils_anomaly_score.R b/R/utils_anomaly_score.R index d604f0b3..753e13b7 100644 --- a/R/utils_anomaly_score.R +++ b/R/utils_anomaly_score.R @@ -79,68 +79,174 @@ #' Calculate mean increase #' @noRd -.add_mean_increase = function(quality_vector){ - - mean_increase = numeric(length(quality_vector)) - mean_increase[1] = 0 +.add_mean_increase = function(quality_vector) { + + n = length(quality_vector) + mean_increase = rep(NA_real_, n) + d = 0.5 - - for(k in 2:length(quality_vector)) { - # 5 is reference (3 sigma) - if (mean_increase[k] > 5){ - mean_increase[k] = max(0,(quality_vector[k] - d), na.rm = TRUE) - } else { - mean_increase[k] = max(0, - (quality_vector[k] - d + mean_increase[k-1]), - na.rm = TRUE) # positive CuSum + h = 5 + + state = 0 + mean_increase[1] = if (is.na(quality_vector[1])) NA_real_ else 0 + + if (n >= 2) { + for (k in 2:n) { + + if (is.na(quality_vector[k])) { + mean_increase[k] = NA_real_ # output NA at missing + next # state unchanged (carry forward) + } + + if (state > h) { + state = max(0, quality_vector[k] - d) + } else { + state = max(0, quality_vector[k] - d + state) + } + + mean_increase[k] = state } } - return(mean_increase) + + mean_increase } +# .add_mean_increase = function(quality_vector){ + +# mean_increase = numeric(length(quality_vector)) +# mean_increase[1] = 0 +# d = 0.5 + +# for(k in 2:length(quality_vector)) { +# # 5 is reference (3 sigma) +# if (mean_increase[k - 1] > 5){ +# mean_increase[k] = max(0,(quality_vector[k] - d), na.rm = TRUE) +# } else { +# mean_increase[k] = max(0, +# (quality_vector[k] - d + mean_increase[k-1]), +# na.rm = TRUE) # positive CuSum +# } +# } +# return(mean_increase) +# } #' Calculate mean decrease #' @noRd -.add_mean_decrease = function(quality_vector){ - - mean_decrease = numeric(length(quality_vector)) - mean_decrease[1] = 0 +.add_mean_decrease = function(quality_vector) { + + n = length(quality_vector) + mean_decrease = rep(NA_real_, n) + d = -0.5 - - for(k in 2:length(quality_vector)) { - # 5 is reference (3 sigma) - if (mean_decrease[k]>5){ - mean_decrease[k] <- max(0,(d - quality_vector[k] + 0), na.rm = TRUE) - } else { - mean_decrease[k] <- max(0, - (d - quality_vector[k] + mean_decrease[k-1]), - na.rm = TRUE) # negative CuSum + h = 5 + + state = 0 + mean_decrease[1] = + if (is.na(quality_vector[1])) NA_real_ else 0 + + if (n >= 2) { + for (k in 2:n) { + + if (is.na(quality_vector[k])) { + mean_decrease[k] = NA_real_ # mark missing + next # carry state forward + } + + if (state > h) { + state = max(0, d - quality_vector[k]) + } else { + state = max(0, d - quality_vector[k] + state) + } + + mean_decrease[k] = state } } - return(mean_decrease) + + mean_decrease } +# .add_mean_decrease = function(quality_vector){ + +# mean_decrease = numeric(length(quality_vector)) +# mean_decrease[1] = 0 +# d = -0.5 + +# for(k in 2:length(quality_vector)) { +# # 5 is reference (3 sigma) +# if (mean_decrease[k - 1] > 5){ +# mean_decrease[k] <- max(0,(d - quality_vector[k] + 0), na.rm = TRUE) +# } else { +# mean_decrease[k] <- max(0, +# (d - quality_vector[k] + mean_decrease[k-1]), +# na.rm = TRUE) # negative CuSum +# } +# } +# return(mean_decrease) +# } #' Calculate dispersion increase #' @noRd -.add_dispersion_increase = function(quality_vector){ - dispersion_increase = numeric(length(quality_vector)) - v = numeric(length(quality_vector)) - v[1] = (sqrt(abs(quality_vector[1]))-0.822)/0.349 +.add_dispersion_increase = function(quality_vector) { + + n = length(quality_vector) + dispersion_increase = rep(NA_real_, n) + v = rep(NA_real_, n) + d = 0.5 - for(k in 2:length(quality_vector)) { - - v[k] = (sqrt(abs(quality_vector[k]))-0.822)/0.349 - - if (dispersion_increase[k] > 5){ - dispersion_increase[k] = max(0,(v[k] - d), - na.rm = TRUE) - } else { - dispersion_increase[k] = max(0, - (v[k] - d + dispersion_increase[k-1]), - na.rm = TRUE) # CuSum variance + h = 5 + + # initialize state + state = 0 + + if (!is.na(quality_vector[1])) { + v[1] = (sqrt(abs(quality_vector[1])) - 0.822) / 0.349 + dispersion_increase[1] = 0 + } else { + v[1] = NA_real_ + dispersion_increase[1] = NA_real_ + } + + if (n >= 2) { + for (k in 2:n) { + + if (is.na(quality_vector[k])) { + v[k] = NA_real_ + dispersion_increase[k] = NA_real_ # mark missing + next # carry state forward + } + + v[k] = (sqrt(abs(quality_vector[k])) - 0.822) / 0.349 + + if (state > h) { + state = max(0, v[k] - d) + } else { + state = max(0, v[k] - d + state) + } + + dispersion_increase[k] = state } } + return(dispersion_increase) } +# .add_dispersion_increase = function(quality_vector){ +# dispersion_increase = numeric(length(quality_vector)) +# v = numeric(length(quality_vector)) +# v[1] = (sqrt(abs(quality_vector[1]))-0.822)/0.349 +# d = 0.5 +# for(k in 2:length(quality_vector)) { + +# v[k] = (sqrt(abs(quality_vector[k]))-0.822)/0.349 + +# if (dispersion_increase[k - 1] > 5){ +# dispersion_increase[k] = max(0,(v[k] - d), +# na.rm = TRUE) +# } else { +# dispersion_increase[k] = max(0, +# (v[k] - d + dispersion_increase[k-1]), +# na.rm = TRUE) # CuSum variance +# } +# } +# return(dispersion_increase) +# } #' Train isolation forest model in parallel #' @import parallel diff --git a/src/isolation_forest.cpp b/src/isolation_forest.cpp index f1abac22..1f1972a7 100644 --- a/src/isolation_forest.cpp +++ b/src/isolation_forest.cpp @@ -201,6 +201,221 @@ DataFrame calculate_anomaly_score( ); } +// #include +// #include +// #include +// #include +// #include +// #include +// #include +// // #include + +// using namespace Rcpp; +// // using namespace RcppParallel; + + +// const double EM_CONSTANT = 0.5772156649; // Euler-Mascheroni constant +// std::mt19937 gen(42); // Fixed random seed for reproducibility + +// // Define a structure for an Isolation Tree Node +// struct IsolationTreeNode { +// bool is_leaf; +// int size; +// std::string feature; +// double value; +// bool is_missing_split; +// std::unique_ptr left; +// std::unique_ptr right; + +// IsolationTreeNode(int s) : is_leaf(true), size(s), value(0), is_missing_split(false) {} +// IsolationTreeNode( +// std::string feat, +// double val, +// bool missing_split, +// std::unique_ptr l, +// std::unique_ptr r) +// : is_leaf(false), size(0), +// feature(std::move(feat)), +// value(val), is_missing_split(missing_split), +// left(std::move(l)), right(std::move(r)) {} +// }; + +// // Convert R DataFrame to C++ data structure +// std::vector> convert_dataframe(const DataFrame& df) { +// std::vector> data; +// int n_rows = df.nrows(); +// CharacterVector col_names = df.names(); +// int n_cols = col_names.size(); + +// for (int i = 0; i < n_rows; ++i) { +// std::unordered_map row; +// for (int j = 0; j < n_cols; ++j) { +// NumericVector col = df[j]; +// row[as(col_names[j])] = col[i]; +// } +// data.push_back(row); +// } +// return data; +// } + +// // Function to create an isolation tree +// std::unique_ptr isolation_tree( +// const std::vector>& data, +// int depth = 0, int max_depth = -1) { + +// int n = data.size(); +// if (max_depth == -1) { +// max_depth = static_cast(log2(n)); +// } +// if (n <= 1 || depth >= max_depth) { +// return std::make_unique(n); +// } + +// // Create vector of features to split on +// std::vector features; +// for (const auto& [key, _] : data[0]) { +// features.push_back(key); +// } + +// std::uniform_int_distribution<> feature_dist(0, features.size() - 1); +// std::string split_feature = features[feature_dist(gen)]; + +// // Can split on numeric or missing value +// double min_val = data[0].at(split_feature); +// double max_val = min_val; +// int missing_count = 0; + +// for (const auto& row : data) { +// double val = row.at(split_feature); +// if (std::isnan(val)) { +// missing_count++; +// continue; +// } +// min_val = std::min(min_val, val); +// max_val = std::max(max_val, val); +// } + +// bool has_missing = (missing_count > 0); +// double missing_fraction = +// static_cast(missing_count) / static_cast(n); + +// if (min_val == max_val && !has_missing) { +// return std::make_unique(n); +// } + +// // --- NEW: calibrated missing split probability --- +// double p_miss = std::min(0.5, missing_fraction); + +// bool is_missing_split = false; +// double split_value = 0.0; + +// if (has_missing && +// (std::bernoulli_distribution(p_miss)(gen) || min_val == max_val)) { +// is_missing_split = true; +// } else { +// std::uniform_real_distribution<> split_dist(min_val, max_val); +// split_value = split_dist(gen); +// } + +// std::vector> left_data, right_data; +// for (const auto& row : data) { +// if (is_missing_split) { +// if (std::isnan(row.at(split_feature))) { +// left_data.push_back(row); +// } else { +// right_data.push_back(row); +// } +// } else { +// if (row.at(split_feature) < split_value) { +// left_data.push_back(row); +// } else { +// right_data.push_back(row); +// } +// } +// } + +// return std::make_unique( +// split_feature, split_value, is_missing_split, +// isolation_tree(left_data, depth + 1, max_depth), +// isolation_tree(right_data, depth + 1, max_depth) +// ); +// } + +// // Train an isolation forest +// std::vector> isolation_forest( +// const std::vector>& data, +// int n_trees, +// int max_depth +// ) { +// std::vector> forest; +// forest.reserve(n_trees); + +// for (int i = 0; i < n_trees; ++i) { +// forest.push_back(isolation_tree(data, 0, max_depth)); +// } + +// return forest; +// } + +// // Compute path length of a single observation in a tree +// int path_length(const IsolationTreeNode* node, const std::unordered_map& obs, int depth = 0) { +// if (node->is_leaf) { +// return depth + node->size; +// } + +// auto it = obs.find(node->feature); +// if (it == obs.end()) { +// return depth; // If feature is missing, return current depth +// } + +// if (it->second < node->value) { +// return path_length(node->left.get(), obs, depth + 1); +// } else { +// return path_length(node->right.get(), obs, depth + 1); +// } +// } + +// // Compute anomaly scores for data +// // [[Rcpp::export]] +// DataFrame calculate_anomaly_score( +// DataFrame df, +// int n_trees, +// int max_depth +// ) { + +// std::vector> data = convert_dataframe(df); + +// int n = data.size(); + +// // Generate the forest +// auto forest = isolation_forest(data, n_trees, max_depth); + +// // Compute average path lengths +// std::vector avg_path_length(n, 0.0); +// for (int i = 0; i < n; ++i) { +// double total_path_length = 0.0; +// for (const auto& tree : forest) { +// total_path_length += path_length(tree.get(), data[i]); +// } +// avg_path_length[i] = total_path_length / n_trees; +// } + +// // Compute anomaly scores +// double c_n = 2 * (log(n - 1) + EM_CONSTANT) - (2 * (n - 1) / n); +// std::vector scores(n); +// for (int i = 0; i < n; ++i) { +// scores[i] = std::pow(2, -(avg_path_length[i] / c_n)); +// } + +// // Return DataFrame with results +// return DataFrame::create( +// _["avg_depth"] = avg_path_length, +// _["anomaly_score"] = scores +// ); +// } + +// TODO: Parallelization would be faster within RcppParallel framework +// but requires setup and testing. // Parallel task struct to process each data frame // struct ParallelTask : public Worker { // // Input list of data frames From 2e0f812a765a4900d261ded47e1fcfd87cf6c133 Mon Sep 17 00:00:00 2001 From: devonjkohler Date: Mon, 9 Feb 2026 08:54:26 -0500 Subject: [PATCH 03/11] diann missing values --- R/MSstatsConvert_core_functions.R | 24 ++- R/utils_anomaly_score.R | 1 + src/isolation_forest.cpp | 286 +++++++++++++++++------------- 3 files changed, 190 insertions(+), 121 deletions(-) diff --git a/R/MSstatsConvert_core_functions.R b/R/MSstatsConvert_core_functions.R index 8f06973c..27ce3678 100644 --- a/R/MSstatsConvert_core_functions.R +++ b/R/MSstatsConvert_core_functions.R @@ -548,14 +548,34 @@ MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction, temporal_direction[i])) } } - - input = .runAnomalyModel(input, + + + idx_all_missing = input[, + rowSums(is.na(as.matrix(.SD))) == length(quality_metrics), + .SDcols = quality_metrics] + + input_missing_quality = input[idx_all_missing] + input_measured_quality = input[!idx_all_missing] + + + input_measured_quality = .runAnomalyModel(input_measured_quality, n_trees=n_trees, max_depth=max_depth, cores=cores, split_column="PSM", quality_metrics=quality_metrics) + # Calculate median anomaly score for each PSM + median_scores = input_measured_quality[, + .(MedianAnomalyScore = median(AnomalyScores, na.rm = TRUE)), by = PSM] + input_missing_quality = merge( + input_missing_quality, median_scores, by = "PSM", all.x = TRUE) + input_missing_quality$AnomalyScores = input_missing_quality$MedianAnomalyScore + input_missing_quality$MedianAnomalyScore = NULL + + # Combine measured and missing quality data + input = rbind(input_measured_quality, input_missing_quality, fill = TRUE) + subset_cols = c("Run", "ProteinName", "PeptideSequence", "PrecursorCharge", "FragmentIon", "ProductCharge", "IsotopeLabelType", diff --git a/R/utils_anomaly_score.R b/R/utils_anomaly_score.R index 753e13b7..a048cdda 100644 --- a/R/utils_anomaly_score.R +++ b/R/utils_anomaly_score.R @@ -278,6 +278,7 @@ cat(paste0("Number of PSMs to process: ", num_psm), sep = "\n", file = "MSstats_anomaly_model_progress.log") + # input_data = na.omit(input_data) model_results = parallel::parLapply( cl, seq_len(num_psm), function(i){ diff --git a/src/isolation_forest.cpp b/src/isolation_forest.cpp index 1f1972a7..7ac6dae9 100644 --- a/src/isolation_forest.cpp +++ b/src/isolation_forest.cpp @@ -208,16 +208,12 @@ DataFrame calculate_anomaly_score( // #include // #include // #include -// // #include // using namespace Rcpp; -// // using namespace RcppParallel; - // const double EM_CONSTANT = 0.5772156649; // Euler-Mascheroni constant // std::mt19937 gen(42); // Fixed random seed for reproducibility -// // Define a structure for an Isolation Tree Node // struct IsolationTreeNode { // bool is_leaf; // int size; @@ -226,84 +222,110 @@ DataFrame calculate_anomaly_score( // bool is_missing_split; // std::unique_ptr left; // std::unique_ptr right; - -// IsolationTreeNode(int s) : is_leaf(true), size(s), value(0), is_missing_split(false) {} + +// IsolationTreeNode(int s) +// : is_leaf(true), +// size(s), +// feature(""), +// value(0.0), +// is_missing_split(false), +// left(nullptr), +// right(nullptr) {} + // IsolationTreeNode( -// std::string feat, -// double val, -// bool missing_split, -// std::unique_ptr l, -// std::unique_ptr r) -// : is_leaf(false), size(0), -// feature(std::move(feat)), -// value(val), is_missing_split(missing_split), -// left(std::move(l)), right(std::move(r)) {} +// std::string feat, +// double val, +// bool missing_split, +// std::unique_ptr l, +// std::unique_ptr r +// ) +// : is_leaf(false), +// size(0), +// feature(std::move(feat)), +// value(val), +// is_missing_split(missing_split), +// left(std::move(l)), +// right(std::move(r)) {} // }; -// // Convert R DataFrame to C++ data structure -// std::vector> convert_dataframe(const DataFrame& df) { -// std::vector> data; -// int n_rows = df.nrows(); -// CharacterVector col_names = df.names(); -// int n_cols = col_names.size(); - -// for (int i = 0; i < n_rows; ++i) { -// std::unordered_map row; -// for (int j = 0; j < n_cols; ++j) { -// NumericVector col = df[j]; -// row[as(col_names[j])] = col[i]; -// } -// data.push_back(row); +// std::vector> +// convert_dataframe(const DataFrame& df) { +// std::vector> data; + +// int n_rows = df.nrows(); +// CharacterVector col_names = df.names(); +// int n_cols = col_names.size(); + +// data.reserve(n_rows); + +// for (int i = 0; i < n_rows; ++i) { +// std::unordered_map row; +// row.reserve(n_cols); + +// for (int j = 0; j < n_cols; ++j) { +// NumericVector col = df[j]; +// row[as(col_names[j])] = col[i]; // } -// return data; +// data.push_back(std::move(row)); +// } + +// return data; // } -// // Function to create an isolation tree -// std::unique_ptr isolation_tree( -// const std::vector>& data, -// int depth = 0, int max_depth = -1) { - -// int n = data.size(); +// std::unique_ptr +// isolation_tree( +// const std::vector>& data, +// int depth = 0, +// int max_depth = -1 +// ) { +// int n = static_cast(data.size()); + // if (max_depth == -1) { -// max_depth = static_cast(log2(n)); +// max_depth = static_cast(std::log2(n)); // } + // if (n <= 1 || depth >= max_depth) { // return std::make_unique(n); // } - -// // Create vector of features to split on + // std::vector features; -// for (const auto& [key, _] : data[0]) { -// features.push_back(key); +// features.reserve(data[0].size()); + +// for (const auto& kv : data[0]) { +// features.push_back(kv.first); // } - -// std::uniform_int_distribution<> feature_dist(0, features.size() - 1); -// std::string split_feature = features[feature_dist(gen)]; - -// // Can split on numeric or missing value + +// std::uniform_int_distribution<> feat_dist(0, features.size() - 1); +// std::string split_feature = features[feat_dist(gen)]; + // double min_val = data[0].at(split_feature); // double max_val = min_val; // int missing_count = 0; // for (const auto& row : data) { // double val = row.at(split_feature); + // if (std::isnan(val)) { // missing_count++; // continue; // } + // min_val = std::min(min_val, val); // max_val = std::max(max_val, val); // } // bool has_missing = (missing_count > 0); + // double missing_fraction = -// static_cast(missing_count) / static_cast(n); - +// static_cast(missing_count) / static_cast(n); + // if (min_val == max_val && !has_missing) { // return std::make_unique(n); // } - -// // --- NEW: calibrated missing split probability --- + +// // Calibrated missing split probability: +// // - rare missing -> rare split (more anomalous) +// // - common missing -> more missing splits (less anomalous) // double p_miss = std::min(0.5, missing_fraction); // bool is_missing_split = false; @@ -316,104 +338,130 @@ DataFrame calculate_anomaly_score( // std::uniform_real_distribution<> split_dist(min_val, max_val); // split_value = split_dist(gen); // } - -// std::vector> left_data, right_data; + +// std::vector> left_data; +// std::vector> right_data; +// left_data.reserve(n); +// right_data.reserve(n); + // for (const auto& row : data) { +// double v = row.at(split_feature); + // if (is_missing_split) { -// if (std::isnan(row.at(split_feature))) { +// if (std::isnan(v)) { // left_data.push_back(row); // } else { // right_data.push_back(row); // } // } else { -// if (row.at(split_feature) < split_value) { +// if (v < split_value) { // left_data.push_back(row); // } else { // right_data.push_back(row); // } // } // } - + // return std::make_unique( -// split_feature, split_value, is_missing_split, +// split_feature, +// split_value, +// is_missing_split, // isolation_tree(left_data, depth + 1, max_depth), // isolation_tree(right_data, depth + 1, max_depth) // ); // } -// // Train an isolation forest -// std::vector> isolation_forest( -// const std::vector>& data, -// int n_trees, -// int max_depth +// std::vector> +// isolation_forest( +// const std::vector>& data, +// int n_trees, +// int max_depth // ) { -// std::vector> forest; -// forest.reserve(n_trees); - -// for (int i = 0; i < n_trees; ++i) { -// forest.push_back(isolation_tree(data, 0, max_depth)); -// } - -// return forest; +// std::vector> forest; +// forest.reserve(n_trees); + +// for (int i = 0; i < n_trees; ++i) { +// forest.push_back(isolation_tree(data, 0, max_depth)); +// } + +// return forest; // } -// // Compute path length of a single observation in a tree -// int path_length(const IsolationTreeNode* node, const std::unordered_map& obs, int depth = 0) { -// if (node->is_leaf) { -// return depth + node->size; -// } - -// auto it = obs.find(node->feature); -// if (it == obs.end()) { -// return depth; // If feature is missing, return current depth -// } - -// if (it->second < node->value) { -// return path_length(node->left.get(), obs, depth + 1); -// } else { -// return path_length(node->right.get(), obs, depth + 1); +// // IMPORTANT FIX: respect is_missing_split during traversal +// int path_length( +// const IsolationTreeNode* node, +// const std::unordered_map& obs, +// int depth = 0 +// ) { +// if (node->is_leaf) { +// return depth + node->size; +// } + +// auto it = obs.find(node->feature); +// if (it == obs.end()) { +// // Feature absent from obs map. Fall back: stop at current depth. +// return depth; +// } + +// double x = it->second; + +// if (node->is_missing_split) { +// if (std::isnan(x)) { +// return path_length(node->left.get(), obs, depth + 1); // } +// return path_length(node->right.get(), obs, depth + 1); +// } + +// if (std::isnan(x)) { +// // Numeric split but x missing: choose a side deterministically. +// // Here: treat missing as going left. +// return path_length(node->left.get(), obs, depth + 1); +// } + +// if (x < node->value) { +// return path_length(node->left.get(), obs, depth + 1); +// } + +// return path_length(node->right.get(), obs, depth + 1); // } -// // Compute anomaly scores for data // // [[Rcpp::export]] -// DataFrame calculate_anomaly_score( -// DataFrame df, -// int n_trees, -// int max_depth -// ) { - -// std::vector> data = convert_dataframe(df); - -// int n = data.size(); - -// // Generate the forest -// auto forest = isolation_forest(data, n_trees, max_depth); - -// // Compute average path lengths -// std::vector avg_path_length(n, 0.0); -// for (int i = 0; i < n; ++i) { -// double total_path_length = 0.0; -// for (const auto& tree : forest) { -// total_path_length += path_length(tree.get(), data[i]); -// } -// avg_path_length[i] = total_path_length / n_trees; -// } - -// // Compute anomaly scores -// double c_n = 2 * (log(n - 1) + EM_CONSTANT) - (2 * (n - 1) / n); -// std::vector scores(n); -// for (int i = 0; i < n; ++i) { -// scores[i] = std::pow(2, -(avg_path_length[i] / c_n)); +// DataFrame calculate_anomaly_score(DataFrame df, int n_trees, int max_depth) { +// std::vector> data = +// convert_dataframe(df); + +// int n = static_cast(data.size()); + +// auto forest = isolation_forest(data, n_trees, max_depth); + +// std::vector avg_path_length(n, 0.0); + +// for (int i = 0; i < n; ++i) { +// double total_path_length = 0.0; + +// for (const auto& tree : forest) { +// total_path_length += path_length(tree.get(), data[i]); // } - -// // Return DataFrame with results -// return DataFrame::create( -// _["avg_depth"] = avg_path_length, -// _["anomaly_score"] = scores -// ); + +// avg_path_length[i] = total_path_length / static_cast(n_trees); +// } + +// double c_n = 2.0 * (std::log(n - 1.0) + EM_CONSTANT) - +// (2.0 * (n - 1.0) / n); + +// std::vector scores(n, 0.0); + +// for (int i = 0; i < n; ++i) { +// scores[i] = std::pow(2.0, -(avg_path_length[i] / c_n)); +// } + +// return DataFrame::create( +// _["avg_depth"] = avg_path_length, +// _["anomaly_score"] = scores +// ); // } + // TODO: Parallelization would be faster within RcppParallel framework // but requires setup and testing. // Parallel task struct to process each data frame From 7ec875289f2627b7b747f080f23deb31e6f72075 Mon Sep 17 00:00:00 2001 From: Devon Kohler Date: Wed, 18 Feb 2026 08:16:25 -0500 Subject: [PATCH 04/11] bug fixes to anomaly model --- R/MSstatsConvert_core_functions.R | 6 +++--- R/utils_anomaly_score.R | 9 +++++++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/R/MSstatsConvert_core_functions.R b/R/MSstatsConvert_core_functions.R index 27ce3678..f104387a 100644 --- a/R/MSstatsConvert_core_functions.R +++ b/R/MSstatsConvert_core_functions.R @@ -535,7 +535,7 @@ MSstatsMakeAnnotation = function(input, annotation, ...) { MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction, missing_run_count, n_feat, run_order, n_trees, max_depth, cores){ - + input = .prepareSpectronautAnomalyInput(input, quality_metrics, run_order, n_feat, missing_run_count) @@ -549,7 +549,7 @@ MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction, } } - + # browser() #AASDAIPPASPK idx_all_missing = input[, rowSums(is.na(as.matrix(.SD))) == length(quality_metrics), .SDcols = quality_metrics] @@ -567,7 +567,7 @@ MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction, # Calculate median anomaly score for each PSM median_scores = input_measured_quality[, - .(MedianAnomalyScore = median(AnomalyScores, na.rm = TRUE)), by = PSM] + .(MedianAnomalyScore = mean(AnomalyScores, na.rm = TRUE)), by = PSM] input_missing_quality = merge( input_missing_quality, median_scores, by = "PSM", all.x = TRUE) input_missing_quality$AnomalyScores = input_missing_quality$MedianAnomalyScore diff --git a/R/utils_anomaly_score.R b/R/utils_anomaly_score.R index a048cdda..33e91db9 100644 --- a/R/utils_anomaly_score.R +++ b/R/utils_anomaly_score.R @@ -278,6 +278,9 @@ cat(paste0("Number of PSMs to process: ", num_psm), sep = "\n", file = "MSstats_anomaly_model_progress.log") + # browser() + model_input = unique(input_data[, c(split_column, quality_metrics), with = FALSE]) + # input_data = na.omit(input_data) model_results = parallel::parLapply( cl, seq_len(num_psm), @@ -286,7 +289,7 @@ cat("Finished processing an additional 100 PSMs", sep = "\n", file = "MSstats_anomaly_model_progress.log", append = TRUE) } - single_psm = input_data[get(split_column) == psm_list[[i]], + single_psm = model_input[model_input[[split_column]] == psm_list[[i]], ..quality_metrics] if (max_depth == "auto"){ @@ -301,7 +304,9 @@ model_results = unlist(model_results) # Clip anomaly scores to stop them from exploding - input_data$AnomalyScores = pmax(model_results, .001) + model_input$AnomalyScores = pmax(model_results, .001) + # browser() + input_data = merge(input_data, model_input, by = c(split_column, quality_metrics), all.x = TRUE) return(input_data) } From ca4335c92dfebf3fe9a416b8ecb76a4583cc2322 Mon Sep 17 00:00:00 2001 From: devonjkohler Date: Wed, 18 Feb 2026 09:24:37 -0500 Subject: [PATCH 05/11] minor documentation to anomaly score functions --- R/MSstatsConvert_core_functions.R | 10 ++++-- R/utils_anomaly_score.R | 58 ------------------------------- 2 files changed, 7 insertions(+), 61 deletions(-) diff --git a/R/MSstatsConvert_core_functions.R b/R/MSstatsConvert_core_functions.R index f104387a..2cb2a941 100644 --- a/R/MSstatsConvert_core_functions.R +++ b/R/MSstatsConvert_core_functions.R @@ -518,6 +518,12 @@ MSstatsMakeAnnotation = function(input, annotation, ...) { } #' Run Anomaly Model +#' +#' Detects anomalous measurements in mass spectrometry data using an isolation forest algorithm. +#' This function identifies unusual precursor measurements based on quality metrics and their +#' temporal patterns. For features with insufficient quality metric data, it assigns anomaly +#' scores based on the median score of similar features (same peptide and charge combination). +#' The model supports parallel processing for improved performance on large datasets. #' #' @param input data.table preprocessed by the MSstatsBalancedDesign function #' @param quality_metrics character vector of quality metrics to use in the model @@ -549,7 +555,6 @@ MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction, } } - # browser() #AASDAIPPASPK idx_all_missing = input[, rowSums(is.na(as.matrix(.SD))) == length(quality_metrics), .SDcols = quality_metrics] @@ -557,7 +562,6 @@ MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction, input_missing_quality = input[idx_all_missing] input_measured_quality = input[!idx_all_missing] - input_measured_quality = .runAnomalyModel(input_measured_quality, n_trees=n_trees, max_depth=max_depth, @@ -567,7 +571,7 @@ MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction, # Calculate median anomaly score for each PSM median_scores = input_measured_quality[, - .(MedianAnomalyScore = mean(AnomalyScores, na.rm = TRUE)), by = PSM] + .(MedianAnomalyScore = median(AnomalyScores, na.rm = TRUE)), by = PSM] input_missing_quality = merge( input_missing_quality, median_scores, by = "PSM", all.x = TRUE) input_missing_quality$AnomalyScores = input_missing_quality$MedianAnomalyScore diff --git a/R/utils_anomaly_score.R b/R/utils_anomaly_score.R index 33e91db9..ee9badd1 100644 --- a/R/utils_anomaly_score.R +++ b/R/utils_anomaly_score.R @@ -110,24 +110,6 @@ mean_increase } -# .add_mean_increase = function(quality_vector){ - -# mean_increase = numeric(length(quality_vector)) -# mean_increase[1] = 0 -# d = 0.5 - -# for(k in 2:length(quality_vector)) { -# # 5 is reference (3 sigma) -# if (mean_increase[k - 1] > 5){ -# mean_increase[k] = max(0,(quality_vector[k] - d), na.rm = TRUE) -# } else { -# mean_increase[k] = max(0, -# (quality_vector[k] - d + mean_increase[k-1]), -# na.rm = TRUE) # positive CuSum -# } -# } -# return(mean_increase) -# } #' Calculate mean decrease #' @noRd @@ -163,24 +145,6 @@ mean_decrease } -# .add_mean_decrease = function(quality_vector){ - -# mean_decrease = numeric(length(quality_vector)) -# mean_decrease[1] = 0 -# d = -0.5 - -# for(k in 2:length(quality_vector)) { -# # 5 is reference (3 sigma) -# if (mean_decrease[k - 1] > 5){ -# mean_decrease[k] <- max(0,(d - quality_vector[k] + 0), na.rm = TRUE) -# } else { -# mean_decrease[k] <- max(0, -# (d - quality_vector[k] + mean_decrease[k-1]), -# na.rm = TRUE) # negative CuSum -# } -# } -# return(mean_decrease) -# } #' Calculate dispersion increase #' @noRd @@ -227,26 +191,6 @@ return(dispersion_increase) } -# .add_dispersion_increase = function(quality_vector){ -# dispersion_increase = numeric(length(quality_vector)) -# v = numeric(length(quality_vector)) -# v[1] = (sqrt(abs(quality_vector[1]))-0.822)/0.349 -# d = 0.5 -# for(k in 2:length(quality_vector)) { - -# v[k] = (sqrt(abs(quality_vector[k]))-0.822)/0.349 - -# if (dispersion_increase[k - 1] > 5){ -# dispersion_increase[k] = max(0,(v[k] - d), -# na.rm = TRUE) -# } else { -# dispersion_increase[k] = max(0, -# (v[k] - d + dispersion_increase[k-1]), -# na.rm = TRUE) # CuSum variance -# } -# } -# return(dispersion_increase) -# } #' Train isolation forest model in parallel #' @import parallel @@ -278,10 +222,8 @@ cat(paste0("Number of PSMs to process: ", num_psm), sep = "\n", file = "MSstats_anomaly_model_progress.log") - # browser() model_input = unique(input_data[, c(split_column, quality_metrics), with = FALSE]) - # input_data = na.omit(input_data) model_results = parallel::parLapply( cl, seq_len(num_psm), function(i){ From 4023a70e1e5c080847f4929fda12fc29c489076f Mon Sep 17 00:00:00 2001 From: devonjkohler Date: Wed, 18 Feb 2026 09:25:23 -0500 Subject: [PATCH 06/11] DIA-NN msstats+ converter docs --- DESCRIPTION | 2 +- man/DIANNtoMSstatsFormat.Rd | 43 ++++++++++++++++++++++++++++++------- man/MSstatsAnomalyScores.Rd | 6 +++++- man/MSstatsClean.Rd | 4 +++- man/dot-cleanRawDIANN.Rd | 4 +++- 5 files changed, 47 insertions(+), 12 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 81b3bb72..cb89a875 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -14,7 +14,7 @@ License: Artistic-2.0 Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 biocViews: MassSpectrometry, Proteomics, Software, DataImport, QualityControl Depends: R (>= 4.0) diff --git a/man/DIANNtoMSstatsFormat.Rd b/man/DIANNtoMSstatsFormat.Rd index 269db0a3..f46b2ad0 100644 --- a/man/DIANNtoMSstatsFormat.Rd +++ b/man/DIANNtoMSstatsFormat.Rd @@ -14,12 +14,21 @@ DIANNtoMSstatsFormat( removeFewMeasurements = TRUE, removeOxidationMpeptides = TRUE, removeProtein_with1Feature = TRUE, + MBR = TRUE, + quantificationColumn = "FragmentQuantCorrected", + calculateAnomalyScores = FALSE, + anomalyModelFeatures = c(), + anomalyModelFeatureTemporal = c(), + removeMissingFeatures = 0.5, + anomalyModelFeatureCount = 100, + runOrder = NULL, + n_trees = 100, + max_depth = "auto", + numberOfCores = 1, use_log_file = TRUE, append = FALSE, verbose = TRUE, log_file_path = NULL, - MBR = TRUE, - quantificationColumn = "FragmentQuantCorrected", ... ) } @@ -51,6 +60,30 @@ Default is 0.01.} \item{removeProtein_with1Feature}{should proteins with a single feature be removed} +\item{MBR}{True if analysis was done with match between runs} + +\item{quantificationColumn}{Use 'FragmentQuantCorrected'(default) column for quantified intensities for DIANN 1.8.x. +Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x. +Use 'auto' for quantified intensities for DIANN 2.x where each fragment intensity is a separate column, e.g. Fr0Quantity.} + +\item{calculateAnomalyScores}{Default is FALSE. If TRUE, will run anomaly detection model and calculate anomaly scores for each feature. Used downstream to weigh measurements in differential analysis.} + +\item{anomalyModelFeatures}{character vector of quality metric column names to be used as features in the anomaly detection model. List must not be empty if calculateAnomalyScores=TRUE.} + +\item{anomalyModelFeatureTemporal}{character vector of temporal direction corresponding to columns passed to anomalyModelFeatures. Values must be one of: \code{mean_decrease}, \code{mean_increase}, \code{dispersion_increase}, or NULL (to perform no temporal feature engineering). Default is empty vector. If calculateAnomalyScores=TRUE, vector must have as many values as anomalyModelFeatures (even if all NULL).} + +\item{removeMissingFeatures}{Remove features with missing values in more than this fraction of runs. Default is 0.5. Only used if calculateAnomalyScores=TRUE.} + +\item{anomalyModelFeatureCount}{Feature selection for anomaly model. Anomaly detection works on the precursor-level and can be much slower if all features used. We will by default filter to the top-100 highest intensity features. This can be adjusted as necessary. To turn feature-selection off, set this value to a high number (e.g. 10000). Only used if calculateAnomalyScores=TRUE.} + +\item{runOrder}{Temporal order of MS runs. Should be a two column data.table with columns \code{Run} and \code{Order}, where \code{Run} matches the run name output by Spectronaut and \code{Order} is an integer. Used to engineer the temporal features defined in anomalyModelFeatureTemporal.} + +\item{n_trees}{Number of trees to use in isolation forest when calculateAnomalyScores=TRUE. Default is 100.} + +\item{max_depth}{Max tree depth to use in isolation forest when calculateAnomalyScores=TRUE. Default is "auto" which calculates depth as log2(N) where N is the number of runs. Otherwise must be an integer.} + +\item{numberOfCores}{Number of cores for parallel processing anomaly detection model. When > 1, a logfile named 'MSstats_anomaly_model_progress.log' is created to track progress. Only works for Linux & Mac OS. Default is 1.} + \item{use_log_file}{logical. If TRUE, information about data processing will be saved to a file.} @@ -65,12 +98,6 @@ data processing will be saved. If not provided, such a file will be created automatically. If \code{append = TRUE}, has to be a valid path to a file.} -\item{MBR}{True if analysis was done with match between runs} - -\item{quantificationColumn}{Use 'FragmentQuantCorrected'(default) column for quantified intensities for DIANN 1.8.x. -Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x. -Use 'auto' for quantified intensities for DIANN 2.x where each fragment intensity is a separate column, e.g. Fr0Quantity.} - \item{...}{additional parameters to \code{data.table::fread}.} } \value{ diff --git a/man/MSstatsAnomalyScores.Rd b/man/MSstatsAnomalyScores.Rd index bc2b18a3..b6a9d500 100644 --- a/man/MSstatsAnomalyScores.Rd +++ b/man/MSstatsAnomalyScores.Rd @@ -39,5 +39,9 @@ MSstatsAnomalyScores( data.table } \description{ -Run Anomaly Model +Detects anomalous measurements in mass spectrometry data using an isolation forest algorithm. +This function identifies unusual precursor measurements based on quality metrics and their +temporal patterns. For features with insufficient quality metric data, it assigns anomaly +scores based on the median score of similar features (same peptide and charge combination). +The model supports parallel processing for improved performance on large datasets. } diff --git a/man/MSstatsClean.Rd b/man/MSstatsClean.Rd index 0e71ab7d..affbd147 100644 --- a/man/MSstatsClean.Rd +++ b/man/MSstatsClean.Rd @@ -66,7 +66,9 @@ MSstatsClean(msstats_object, ...) \S4method{MSstatsClean}{MSstatsDIANNFiles}( msstats_object, MBR = TRUE, - quantificationColumn = "FragmentQuantCorrected" + quantificationColumn = "FragmentQuantCorrected", + calculateAnomalyScores = FALSE, + anomalyModelFeatures = c() ) \S4method{MSstatsClean}{MSstatsMetamorpheusFiles}(msstats_object, MBR = TRUE, qvalue_cutoff = 0.05) diff --git a/man/dot-cleanRawDIANN.Rd b/man/dot-cleanRawDIANN.Rd index 118195e7..fa0e1da8 100644 --- a/man/dot-cleanRawDIANN.Rd +++ b/man/dot-cleanRawDIANN.Rd @@ -7,7 +7,9 @@ .cleanRawDIANN( msstats_object, MBR = TRUE, - quantificationColumn = "FragmentQuantCorrected" + quantificationColumn = "FragmentQuantCorrected", + calculateAnomalyScores = FALSE, + anomalyModelFeatures = c() ) } \arguments{ From c28b200d5c9b95130f42fa1860c0c495478773e1 Mon Sep 17 00:00:00 2001 From: devonjkohler Date: Wed, 18 Feb 2026 10:17:40 -0500 Subject: [PATCH 07/11] remove missing value split --- R/MSstatsConvert_core_functions.R | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/R/MSstatsConvert_core_functions.R b/R/MSstatsConvert_core_functions.R index 2cb2a941..b75eeba3 100644 --- a/R/MSstatsConvert_core_functions.R +++ b/R/MSstatsConvert_core_functions.R @@ -555,14 +555,14 @@ MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction, } } - idx_all_missing = input[, - rowSums(is.na(as.matrix(.SD))) == length(quality_metrics), - .SDcols = quality_metrics] + # idx_all_missing = input[, + # rowSums(is.na(as.matrix(.SD))) == length(quality_metrics), + # .SDcols = quality_metrics] - input_missing_quality = input[idx_all_missing] - input_measured_quality = input[!idx_all_missing] + # input_missing_quality = input[idx_all_missing] + # input_measured_quality = input[!idx_all_missing] - input_measured_quality = .runAnomalyModel(input_measured_quality, + input = .runAnomalyModel(input, n_trees=n_trees, max_depth=max_depth, cores=cores, @@ -570,15 +570,15 @@ MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction, quality_metrics=quality_metrics) # Calculate median anomaly score for each PSM - median_scores = input_measured_quality[, - .(MedianAnomalyScore = median(AnomalyScores, na.rm = TRUE)), by = PSM] - input_missing_quality = merge( - input_missing_quality, median_scores, by = "PSM", all.x = TRUE) - input_missing_quality$AnomalyScores = input_missing_quality$MedianAnomalyScore - input_missing_quality$MedianAnomalyScore = NULL + # median_scores = input_measured_quality[, + # .(MedianAnomalyScore = median(AnomalyScores, na.rm = TRUE)), by = PSM] + # input_missing_quality = merge( + # input_missing_quality, median_scores, by = "PSM", all.x = TRUE) + # input_missing_quality$AnomalyScores = input_missing_quality$MedianAnomalyScore + # input_missing_quality$MedianAnomalyScore = NULL # Combine measured and missing quality data - input = rbind(input_measured_quality, input_missing_quality, fill = TRUE) + # input = rbind(input_measured_quality, input_missing_quality, fill = TRUE) subset_cols = c("Run", "ProteinName", "PeptideSequence", "PrecursorCharge", "FragmentIon", From 6a764d05fafeda517a7afbd60b7b43b5ff0d5fa7 Mon Sep 17 00:00:00 2001 From: devonjkohler Date: Wed, 18 Feb 2026 10:41:33 -0500 Subject: [PATCH 08/11] rollback --- R/MSstatsConvert_core_functions.R | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/R/MSstatsConvert_core_functions.R b/R/MSstatsConvert_core_functions.R index b75eeba3..2cb2a941 100644 --- a/R/MSstatsConvert_core_functions.R +++ b/R/MSstatsConvert_core_functions.R @@ -555,14 +555,14 @@ MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction, } } - # idx_all_missing = input[, - # rowSums(is.na(as.matrix(.SD))) == length(quality_metrics), - # .SDcols = quality_metrics] + idx_all_missing = input[, + rowSums(is.na(as.matrix(.SD))) == length(quality_metrics), + .SDcols = quality_metrics] - # input_missing_quality = input[idx_all_missing] - # input_measured_quality = input[!idx_all_missing] + input_missing_quality = input[idx_all_missing] + input_measured_quality = input[!idx_all_missing] - input = .runAnomalyModel(input, + input_measured_quality = .runAnomalyModel(input_measured_quality, n_trees=n_trees, max_depth=max_depth, cores=cores, @@ -570,15 +570,15 @@ MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction, quality_metrics=quality_metrics) # Calculate median anomaly score for each PSM - # median_scores = input_measured_quality[, - # .(MedianAnomalyScore = median(AnomalyScores, na.rm = TRUE)), by = PSM] - # input_missing_quality = merge( - # input_missing_quality, median_scores, by = "PSM", all.x = TRUE) - # input_missing_quality$AnomalyScores = input_missing_quality$MedianAnomalyScore - # input_missing_quality$MedianAnomalyScore = NULL + median_scores = input_measured_quality[, + .(MedianAnomalyScore = median(AnomalyScores, na.rm = TRUE)), by = PSM] + input_missing_quality = merge( + input_missing_quality, median_scores, by = "PSM", all.x = TRUE) + input_missing_quality$AnomalyScores = input_missing_quality$MedianAnomalyScore + input_missing_quality$MedianAnomalyScore = NULL # Combine measured and missing quality data - # input = rbind(input_measured_quality, input_missing_quality, fill = TRUE) + input = rbind(input_measured_quality, input_missing_quality, fill = TRUE) subset_cols = c("Run", "ProteinName", "PeptideSequence", "PrecursorCharge", "FragmentIon", From 67a50908041d81ec1d1f3df84b504d504f27ab8e Mon Sep 17 00:00:00 2001 From: devonjkohler Date: Sat, 21 Feb 2026 08:06:40 -0500 Subject: [PATCH 09/11] updated anomaly calc --- R/MSstatsConvert_core_functions.R | 20 +------------------- R/utils_anomaly_score.R | 8 +++----- 2 files changed, 4 insertions(+), 24 deletions(-) diff --git a/R/MSstatsConvert_core_functions.R b/R/MSstatsConvert_core_functions.R index 2cb2a941..c5e98c77 100644 --- a/R/MSstatsConvert_core_functions.R +++ b/R/MSstatsConvert_core_functions.R @@ -555,30 +555,12 @@ MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction, } } - idx_all_missing = input[, - rowSums(is.na(as.matrix(.SD))) == length(quality_metrics), - .SDcols = quality_metrics] - - input_missing_quality = input[idx_all_missing] - input_measured_quality = input[!idx_all_missing] - - input_measured_quality = .runAnomalyModel(input_measured_quality, + input = .runAnomalyModel(input, n_trees=n_trees, max_depth=max_depth, cores=cores, split_column="PSM", quality_metrics=quality_metrics) - - # Calculate median anomaly score for each PSM - median_scores = input_measured_quality[, - .(MedianAnomalyScore = median(AnomalyScores, na.rm = TRUE)), by = PSM] - input_missing_quality = merge( - input_missing_quality, median_scores, by = "PSM", all.x = TRUE) - input_missing_quality$AnomalyScores = input_missing_quality$MedianAnomalyScore - input_missing_quality$MedianAnomalyScore = NULL - - # Combine measured and missing quality data - input = rbind(input_measured_quality, input_missing_quality, fill = TRUE) subset_cols = c("Run", "ProteinName", "PeptideSequence", "PrecursorCharge", "FragmentIon", diff --git a/R/utils_anomaly_score.R b/R/utils_anomaly_score.R index ee9badd1..d5981732 100644 --- a/R/utils_anomaly_score.R +++ b/R/utils_anomaly_score.R @@ -244,11 +244,9 @@ } ) - model_results = unlist(model_results) - # Clip anomaly scores to stop them from exploding - model_input$AnomalyScores = pmax(model_results, .001) - - # browser() + model_input$AnomalyScores = unlist(model_results) + input_data = merge(input_data, model_input, by = c(split_column, quality_metrics), all.x = TRUE) + return(input_data) } From 692546b729a2ccfc9e7f9a6a67a00402b41e841b Mon Sep 17 00:00:00 2001 From: Devon Kohler <35807256+devonjkohler@users.noreply.github.com> Date: Mon, 2 Mar 2026 10:14:39 -0500 Subject: [PATCH 10/11] Update R/converters_DIANNtoMSstatsFormat.R Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- R/converters_DIANNtoMSstatsFormat.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/converters_DIANNtoMSstatsFormat.R b/R/converters_DIANNtoMSstatsFormat.R index 806ce95b..82268c8a 100644 --- a/R/converters_DIANNtoMSstatsFormat.R +++ b/R/converters_DIANNtoMSstatsFormat.R @@ -28,7 +28,7 @@ #' @param anomalyModelFeatureTemporal character vector of temporal direction corresponding to columns passed to anomalyModelFeatures. Values must be one of: `mean_decrease`, `mean_increase`, `dispersion_increase`, or NULL (to perform no temporal feature engineering). Default is empty vector. If calculateAnomalyScores=TRUE, vector must have as many values as anomalyModelFeatures (even if all NULL). #' @param removeMissingFeatures Remove features with missing values in more than this fraction of runs. Default is 0.5. Only used if calculateAnomalyScores=TRUE. #' @param anomalyModelFeatureCount Feature selection for anomaly model. Anomaly detection works on the precursor-level and can be much slower if all features used. We will by default filter to the top-100 highest intensity features. This can be adjusted as necessary. To turn feature-selection off, set this value to a high number (e.g. 10000). Only used if calculateAnomalyScores=TRUE. -#' @param runOrder Temporal order of MS runs. Should be a two column data.table with columns `Run` and `Order`, where `Run` matches the run name output by Spectronaut and `Order` is an integer. Used to engineer the temporal features defined in anomalyModelFeatureTemporal. +#' `@param` runOrder Temporal order of MS runs. Should be a two column data.table with columns `Run` and `Order`, where `Run` matches the run name output by DIA-NN and `Order` is an integer. Used to engineer the temporal features defined in anomalyModelFeatureTemporal. #' @param n_trees Number of trees to use in isolation forest when calculateAnomalyScores=TRUE. Default is 100. #' @param max_depth Max tree depth to use in isolation forest when calculateAnomalyScores=TRUE. Default is "auto" which calculates depth as log2(N) where N is the number of runs. Otherwise must be an integer. #' @param numberOfCores Number of cores for parallel processing anomaly detection model. When > 1, a logfile named 'MSstats_anomaly_model_progress.log' is created to track progress. Only works for Linux & Mac OS. Default is 1. From 004766b721e7cb363966b73bcc55e139f9c1097d Mon Sep 17 00:00:00 2001 From: Devon Kohler <35807256+devonjkohler@users.noreply.github.com> Date: Mon, 2 Mar 2026 10:23:12 -0500 Subject: [PATCH 11/11] Fix formatting of runOrder parameter documentation --- R/converters_DIANNtoMSstatsFormat.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/converters_DIANNtoMSstatsFormat.R b/R/converters_DIANNtoMSstatsFormat.R index aa80b2f6..833990f6 100644 --- a/R/converters_DIANNtoMSstatsFormat.R +++ b/R/converters_DIANNtoMSstatsFormat.R @@ -27,7 +27,7 @@ #' @param anomalyModelFeatureTemporal character vector of temporal direction corresponding to columns passed to anomalyModelFeatures. Values must be one of: `mean_decrease`, `mean_increase`, `dispersion_increase`, or NULL (to perform no temporal feature engineering). Default is empty vector. If calculateAnomalyScores=TRUE, vector must have as many values as anomalyModelFeatures (even if all NULL). #' @param removeMissingFeatures Remove features with missing values in more than this fraction of runs. Default is 0.5. Only used if calculateAnomalyScores=TRUE. #' @param anomalyModelFeatureCount Feature selection for anomaly model. Anomaly detection works on the precursor-level and can be much slower if all features used. We will by default filter to the top-100 highest intensity features. This can be adjusted as necessary. To turn feature-selection off, set this value to a high number (e.g. 10000). Only used if calculateAnomalyScores=TRUE. -#' `@param` runOrder Temporal order of MS runs. Should be a two column data.table with columns `Run` and `Order`, where `Run` matches the run name output by DIA-NN and `Order` is an integer. Used to engineer the temporal features defined in anomalyModelFeatureTemporal. +#' @param runOrder Temporal order of MS runs. Should be a two column data.table with columns `Run` and `Order`, where `Run` matches the run name output by DIA-NN and `Order` is an integer. Used to engineer the temporal features defined in anomalyModelFeatureTemporal. #' @param n_trees Number of trees to use in isolation forest when calculateAnomalyScores=TRUE. Default is 100. #' @param max_depth Max tree depth to use in isolation forest when calculateAnomalyScores=TRUE. Default is "auto" which calculates depth as log2(N) where N is the number of runs. Otherwise must be an integer. #' @param numberOfCores Number of cores for parallel processing anomaly detection model. When > 1, a logfile named 'MSstats_anomaly_model_progress.log' is created to track progress. Only works for Linux & Mac OS. Default is 1. @@ -145,4 +145,4 @@ DIANNtoMSstatsFormat = function( getOption("MSstatsMsg")("INFO", msg_final) getOption("MSstatsLog")("INFO", "\n") input -} \ No newline at end of file +}