From 4f6941d77a500e5c8ee8dfdbc7355099f920c0a4 Mon Sep 17 00:00:00 2001
From: devonjkohler <devonjkohler@gmail.com>
Date: Tue, 13 Jan 2026 10:05:39 -0500
Subject: [PATCH 01/11] DIANN MSstatsplus converter

---
 .gitignore                          |  4 +-
 R/clean_DIANN.R                     | 20 +++++--
 R/converters_DIANNtoMSstatsFormat.R | 82 ++++++++++++++++++++---------
 3 files changed, 75 insertions(+), 31 deletions(-)

diff --git a/.gitignore b/.gitignore
index 4f1fdc64..9d0335a5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,4 +8,6 @@ inst/doc
 *.log
 *.o
 *.so
-*.dll
\ No newline at end of file
+*.dll
+.lintr
+.vscode
\ No newline at end of file
diff --git a/R/clean_DIANN.R b/R/clean_DIANN.R
index 2c8a46b7..83f24aff 100644
--- a/R/clean_DIANN.R
+++ b/R/clean_DIANN.R
@@ -8,7 +8,9 @@
 #' @importFrom stats na.omit
 #' @keywords internal
 .cleanRawDIANN <- function(msstats_object, MBR = TRUE, 
-                           quantificationColumn = "FragmentQuantCorrected") {
+                           quantificationColumn = "FragmentQuantCorrected",
+                           calculateAnomalyScores = FALSE, 
+                           anomalyModelFeatures = c()) {
     dn_input <- getInputFile(msstats_object, "input")
     dn_input <- data.table::as.data.table(dn_input)
     
@@ -19,7 +21,9 @@
     dn_input <- .cleanDIANNAddMissingColumns(dn_input)
     
     # Select required columns
-    dn_input <- .cleanDIANNSelectRequiredColumns(dn_input, quantificationColumn, MBR)
+    dn_input <- .cleanDIANNSelectRequiredColumns(dn_input, quantificationColumn, MBR,
+                                                 calculateAnomalyScores,
+                                                 anomalyModelFeatures)
     
     # Split concatenated values
     dn_input <- .cleanDIANNSplitConcatenatedValues(dn_input, quantificationColumn)
@@ -75,7 +79,9 @@
 #' @param MBR logical indicating if match between runs was used
 #' @return data.table with selected columns
 #' @noRd
-.cleanDIANNSelectRequiredColumns <- function(dn_input, quantificationColumn, MBR) {
+.cleanDIANNSelectRequiredColumns <- function(dn_input, quantificationColumn, MBR,
+                                             calculateAnomalyScores,
+                                             anomalyModelFeatures) {
     base_cols <- c('ProteinNames', 'StrippedSequence', 'ModifiedSequence', 
                    'PrecursorCharge', quantificationColumn, 'QValue', 
                    'PrecursorMz', 'FragmentInfo', 'Run')
@@ -86,7 +92,13 @@
         c('GlobalQValue', 'GlobalPGQValue')
     }
     
-    req_cols <- c(base_cols, mbr_cols)
+    qual_cols <- if (calculateAnomalyScores) {
+        anomalyModelFeatures
+    } else {
+        c()
+    }
+
+    req_cols <- c(base_cols, mbr_cols, qual_cols)
     return(dn_input[, req_cols, with = FALSE])
 }
 
diff --git a/R/converters_DIANNtoMSstatsFormat.R b/R/converters_DIANNtoMSstatsFormat.R
index 22d6223a..806ce95b 100644
--- a/R/converters_DIANNtoMSstatsFormat.R
+++ b/R/converters_DIANNtoMSstatsFormat.R
@@ -23,6 +23,15 @@
 #' @param quantificationColumn Use 'FragmentQuantCorrected'(default) column for quantified intensities for DIANN 1.8.x.
 #' Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x. 
 #' Use 'auto' for quantified intensities for DIANN 2.x where each fragment intensity is a separate column, e.g. Fr0Quantity.
+#' @param calculateAnomalyScores Default is FALSE. If TRUE, will run anomaly detection model and calculate anomaly scores for each feature. Used downstream to weigh measurements in differential analysis.
+#' @param anomalyModelFeatures character vector of quality metric column names to be used as features in the anomaly detection model. List must not be empty if calculateAnomalyScores=TRUE.
+#' @param anomalyModelFeatureTemporal character vector of temporal direction corresponding to columns passed to anomalyModelFeatures. Values must be one of: `mean_decrease`, `mean_increase`, `dispersion_increase`, or NULL (to perform no temporal feature engineering). Default is empty vector. If calculateAnomalyScores=TRUE, vector must have as many values as anomalyModelFeatures (even if all NULL).
+#' @param removeMissingFeatures Remove features with missing values in more than this fraction of runs. Default is 0.5. Only used if calculateAnomalyScores=TRUE.
+#' @param anomalyModelFeatureCount Feature selection for anomaly model. Anomaly detection works on the precursor-level and can be much slower if all features used. We will by default filter to the top-100 highest intensity features. This can be adjusted as necessary. To turn feature-selection off, set this value to a high number (e.g. 10000). Only used if calculateAnomalyScores=TRUE.
+#' @param runOrder Temporal order of MS runs. Should be a two column data.table with columns `Run` and `Order`, where `Run` matches the run name output by Spectronaut and `Order` is an integer. Used to engineer the temporal features defined in anomalyModelFeatureTemporal.
+#' @param n_trees Number of trees to use in isolation forest when calculateAnomalyScores=TRUE. Default is 100.
+#' @param max_depth Max tree depth to use in isolation forest when calculateAnomalyScores=TRUE. Default is "auto" which calculates depth as log2(N) where N is the number of runs. Otherwise must be an integer.
+#' @param numberOfCores Number of cores for parallel processing anomaly detection model. When > 1, a logfile named 'MSstats_anomaly_model_progress.log' is created to track progress. Only works for Linux & Mac OS. Default is 1.
 #' @param ... additional parameters to `data.table::fread`.
 #'  
 #' @return data.frame in the MSstats required format.
@@ -52,26 +61,36 @@
 #' output_2_0 = DIANNtoMSstatsFormat(input_2_0, annotation = annot_2_0, MBR = FALSE, 
 #'                                 use_log_file = FALSE, quantificationColumn = 'auto')
 #' head(output_2_0)
-DIANNtoMSstatsFormat = function(input, annotation = NULL,
-                                global_qvalue_cutoff = 0.01,
-                                qvalue_cutoff = 0.01, 
-                                pg_qvalue_cutoff = 0.01,
-                                useUniquePeptide = TRUE, 
-                                removeFewMeasurements = TRUE,
-                                removeOxidationMpeptides = TRUE, 
-                                removeProtein_with1Feature = TRUE,
-                                use_log_file = TRUE, append = FALSE, 
-                                verbose = TRUE, log_file_path = NULL,
-                                MBR = TRUE, 
-                                quantificationColumn = "FragmentQuantCorrected",
-                                ...) {
+DIANNtoMSstatsFormat = function(
+    input, annotation = NULL,
+    global_qvalue_cutoff = 0.01,
+    qvalue_cutoff = 0.01, 
+    pg_qvalue_cutoff = 0.01,
+    useUniquePeptide = TRUE, 
+    removeFewMeasurements = TRUE,
+    removeOxidationMpeptides = TRUE, 
+    removeProtein_with1Feature = TRUE,
+    MBR = TRUE, 
+    quantificationColumn = "FragmentQuantCorrected",
+    calculateAnomalyScores=FALSE, anomalyModelFeatures=c(),
+    anomalyModelFeatureTemporal=c(), removeMissingFeatures=.5,
+    anomalyModelFeatureCount=100,
+    runOrder=NULL, n_trees=100, max_depth="auto", numberOfCores=1, 
+    use_log_file = TRUE, append = FALSE, 
+    verbose = TRUE, log_file_path = NULL,
+    ...) {
     MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose, 
                                         log_file_path)
     
+    anomalyModelFeatures = .standardizeColnames(anomalyModelFeatures)
+
     input = MSstatsConvert::MSstatsImport(list(input = input),
                                           "MSstats", "DIANN")
-    input = MSstatsConvert::MSstatsClean(input, MBR = MBR, 
-                                         quantificationColumn = quantificationColumn)
+    # browser()
+    input = MSstatsConvert::MSstatsClean(
+        input, MBR = MBR, quantificationColumn = quantificationColumn,
+        calculateAnomalyScores = calculateAnomalyScores, 
+        anomalyModelFeatures = anomalyModelFeatures)
     annotation = MSstatsConvert::MSstatsMakeAnnotation(input, annotation)
     
     decoy_filter = list(col_name = "ProteinName",
@@ -86,14 +105,15 @@ DIANNtoMSstatsFormat = function(input, annotation = NULL,
     msg = paste0('** Filtering on Global Q Value < ', global_qvalue_cutoff)
     getOption("MSstatsLog")("INFO", msg)
     getOption("MSstatsMsg")("INFO", msg)
-    
-    input = input[DetectionQValue < global_qvalue_cutoff, ]
+    # browser()
+    input[DetectionQValue >= global_qvalue_cutoff, Intensity := 0]
+
     if (MBR) {
         msg = '** MBR was used to analyze the data. Now setting names and filtering'
         msg_1_mbr = paste0('-- LibPGQValue < ', pg_qvalue_cutoff)
         msg_2_mbr = paste0('-- LibQValue < ', qvalue_cutoff)
-        input = input[LibPGQValue < pg_qvalue_cutoff, ]
-        input = input[LibQValue < qvalue_cutoff, ]
+        input = input[LibPGQValue >= pg_qvalue_cutoff, Intensity := 0]
+        input = input[LibQValue >= qvalue_cutoff, Intensity := 0]
         getOption("MSstatsLog")("INFO", msg)
         getOption("MSstatsMsg")("INFO", msg)
         getOption("MSstatsLog")("INFO", msg_1_mbr)
@@ -105,8 +125,8 @@ DIANNtoMSstatsFormat = function(input, annotation = NULL,
         msg = '** MBR was not used to analyze the data. Now setting names and filtering'
         msg_1 = paste0('-- Filtering on GlobalPGQValue < ', pg_qvalue_cutoff)
         msg_2 = paste0('-- Filtering on GlobalQValue < ', qvalue_cutoff)
-        input = input[GlobalPGQValue < pg_qvalue_cutoff, ]
-        input = input[GlobalQValue < qvalue_cutoff, ]
+        input = input[GlobalPGQValue >= pg_qvalue_cutoff, Intensity := 0]
+        input = input[GlobalQValue >= qvalue_cutoff, Intensity := 0]
         getOption("MSstatsLog")("INFO", msg)
         getOption("MSstatsMsg")("INFO", msg)
         getOption("MSstatsLog")("INFO", msg_1)
@@ -118,6 +138,7 @@ DIANNtoMSstatsFormat = function(input, annotation = NULL,
     
     feature_columns = c("PeptideSequence", "PrecursorCharge",
                         "FragmentIon", "ProductCharge")
+    # browser()
     input = MSstatsConvert::MSstatsPreprocess(
         input, 
         annotation, 
@@ -132,15 +153,24 @@ DIANNtoMSstatsFormat = function(input, annotation = NULL,
             remove_features_with_few_measurements = removeFewMeasurements,
             summarize_multiple_psms = max),
         columns_to_fill = list(Fraction = 1,
-                               IsotopeLabelType = "Light"))
+                               IsotopeLabelType = "Light"),
+        anomaly_metrics = anomalyModelFeatures)
     input[, Intensity := ifelse(Intensity == 0, NA, Intensity)]
-    
+    # browser()
     input = MSstatsConvert::MSstatsBalancedDesign(input, feature_columns, 
-                                                  fill_incomplete = FALSE,
+                                                  fill_incomplete = TRUE,
                                                   handle_fractions = FALSE,
-                                                  remove_few = removeFewMeasurements
+                                                  remove_few = removeFewMeasurements,
+                                                  anomaly_metrics = anomalyModelFeatures
     )
-    
+    # browser()
+    if (calculateAnomalyScores){
+        input = MSstatsConvert::MSstatsAnomalyScores(
+            input, anomalyModelFeatures, anomalyModelFeatureTemporal,
+            removeMissingFeatures, anomalyModelFeatureCount, runOrder, n_trees, 
+            max_depth, numberOfCores)
+    }
+    # browser()
     msg_final = paste("** Finished preprocessing. The dataset is ready",
                       "to be processed by the dataProcess function.")
     getOption("MSstatsLog")("INFO", msg_final)

From 46bfc89a625cdbcb2617d24da2eaf7d10bccf5d3 Mon Sep 17 00:00:00 2001
From: devonjkohler <devonjkohler@gmail.com>
Date: Tue, 13 Jan 2026 10:05:54 -0500
Subject: [PATCH 02/11] minor bug fixes to anomaly detection model

---
 R/utils_anomaly_score.R  | 192 ++++++++++++++++++++++++++--------
 src/isolation_forest.cpp | 215 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 364 insertions(+), 43 deletions(-)

diff --git a/R/utils_anomaly_score.R b/R/utils_anomaly_score.R
index d604f0b3..753e13b7 100644
--- a/R/utils_anomaly_score.R
+++ b/R/utils_anomaly_score.R
@@ -79,68 +79,174 @@
 
 #' Calculate mean increase
 #' @noRd
-.add_mean_increase = function(quality_vector){
-    
-    mean_increase = numeric(length(quality_vector))
-    mean_increase[1] = 0
+.add_mean_increase = function(quality_vector) {
+
+    n = length(quality_vector)
+    mean_increase = rep(NA_real_, n)
+
     d = 0.5
-    
-    for(k in 2:length(quality_vector)) {
-        # 5 is reference (3 sigma)
-        if (mean_increase[k] > 5){
-            mean_increase[k] = max(0,(quality_vector[k] - d), na.rm = TRUE)
-        } else {
-            mean_increase[k] = max(0,
-                                   (quality_vector[k] - d + mean_increase[k-1]),
-                                   na.rm = TRUE) # positive CuSum
+    h = 5
+
+    state = 0
+    mean_increase[1] = if (is.na(quality_vector[1])) NA_real_ else 0
+
+    if (n >= 2) {
+        for (k in 2:n) {
+
+            if (is.na(quality_vector[k])) {
+                mean_increase[k] = NA_real_   # output NA at missing
+                next                          # state unchanged (carry forward)
+            }
+
+            if (state > h) {
+                state = max(0, quality_vector[k] - d)
+            } else {
+                state = max(0, quality_vector[k] - d + state)
+            }
+
+            mean_increase[k] = state
         }
     }
-    return(mean_increase)
+
+    mean_increase
 }
+# .add_mean_increase = function(quality_vector){
+    
+#     mean_increase = numeric(length(quality_vector))
+#     mean_increase[1] = 0
+#     d = 0.5
+    
+#     for(k in 2:length(quality_vector)) {
+#         # 5 is reference (3 sigma)
+#         if (mean_increase[k - 1] > 5){
+#             mean_increase[k] = max(0,(quality_vector[k] - d), na.rm = TRUE)
+#         } else {
+#             mean_increase[k] = max(0,
+#                                    (quality_vector[k] - d + mean_increase[k-1]),
+#                                    na.rm = TRUE) # positive CuSum
+#         }
+#     }
+#     return(mean_increase)
+# }
 
 #' Calculate mean decrease
 #' @noRd
-.add_mean_decrease = function(quality_vector){
-    
-    mean_decrease = numeric(length(quality_vector))
-    mean_decrease[1] = 0
+.add_mean_decrease = function(quality_vector) {
+
+    n = length(quality_vector)
+    mean_decrease = rep(NA_real_, n)
+
     d = -0.5
-    
-    for(k in 2:length(quality_vector)) {
-        # 5 is reference (3 sigma)
-        if (mean_decrease[k]>5){
-            mean_decrease[k] <- max(0,(d - quality_vector[k] + 0), na.rm = TRUE)
-        } else {
-            mean_decrease[k] <- max(0,
-                                    (d - quality_vector[k] + mean_decrease[k-1]),
-                                    na.rm = TRUE) # negative CuSum
+    h = 5
+
+    state = 0
+    mean_decrease[1] =
+        if (is.na(quality_vector[1])) NA_real_ else 0
+
+    if (n >= 2) {
+        for (k in 2:n) {
+
+            if (is.na(quality_vector[k])) {
+                mean_decrease[k] = NA_real_   # mark missing
+                next                          # carry state forward
+            }
+
+            if (state > h) {
+                state = max(0, d - quality_vector[k])
+            } else {
+                state = max(0, d - quality_vector[k] + state)
+            }
+
+            mean_decrease[k] = state
         }
     }
-    return(mean_decrease)
+
+    mean_decrease
 }
+# .add_mean_decrease = function(quality_vector){
+    
+#     mean_decrease = numeric(length(quality_vector))
+#     mean_decrease[1] = 0
+#     d = -0.5
+    
+#     for(k in 2:length(quality_vector)) {
+#         # 5 is reference (3 sigma)
+#         if (mean_decrease[k - 1] > 5){
+#             mean_decrease[k] <- max(0,(d - quality_vector[k] + 0), na.rm = TRUE)
+#         } else {
+#             mean_decrease[k] <- max(0,
+#                                     (d - quality_vector[k] + mean_decrease[k-1]),
+#                                     na.rm = TRUE) # negative CuSum
+#         }
+#     }
+#     return(mean_decrease)
+# }
 
 #' Calculate dispersion increase
 #' @noRd
-.add_dispersion_increase = function(quality_vector){
-    dispersion_increase = numeric(length(quality_vector))
-    v = numeric(length(quality_vector))
-    v[1] = (sqrt(abs(quality_vector[1]))-0.822)/0.349
+.add_dispersion_increase = function(quality_vector) {
+
+    n = length(quality_vector)
+    dispersion_increase = rep(NA_real_, n)
+    v = rep(NA_real_, n)
+
     d = 0.5
-    for(k in 2:length(quality_vector)) {
-        
-        v[k] = (sqrt(abs(quality_vector[k]))-0.822)/0.349 
-        
-        if (dispersion_increase[k] > 5){
-            dispersion_increase[k] = max(0,(v[k] - d),
-                                         na.rm = TRUE)
-        } else {
-            dispersion_increase[k] = max(0, 
-                                         (v[k] - d + dispersion_increase[k-1]),
-                                         na.rm = TRUE) # CuSum variance
+    h = 5
+
+    # initialize state
+    state = 0
+
+    if (!is.na(quality_vector[1])) {
+        v[1] = (sqrt(abs(quality_vector[1])) - 0.822) / 0.349
+        dispersion_increase[1] = 0
+    } else {
+        v[1] = NA_real_
+        dispersion_increase[1] = NA_real_
+    }
+
+    if (n >= 2) {
+        for (k in 2:n) {
+
+            if (is.na(quality_vector[k])) {
+                v[k] = NA_real_
+                dispersion_increase[k] = NA_real_  # mark missing
+                next                               # carry state forward
+            }
+
+            v[k] = (sqrt(abs(quality_vector[k])) - 0.822) / 0.349
+
+            if (state > h) {
+                state = max(0, v[k] - d)
+            } else {
+                state = max(0, v[k] - d + state)
+            }
+
+            dispersion_increase[k] = state
         }
     }
+
     return(dispersion_increase)
 }
+# .add_dispersion_increase = function(quality_vector){
+#     dispersion_increase = numeric(length(quality_vector))
+#     v = numeric(length(quality_vector))
+#     v[1] = (sqrt(abs(quality_vector[1]))-0.822)/0.349
+#     d = 0.5
+#     for(k in 2:length(quality_vector)) {
+        
+#         v[k] = (sqrt(abs(quality_vector[k]))-0.822)/0.349 
+        
+#         if (dispersion_increase[k - 1] > 5){
+#             dispersion_increase[k] = max(0,(v[k] - d),
+#                                          na.rm = TRUE)
+#         } else {
+#             dispersion_increase[k] = max(0, 
+#                                          (v[k] - d + dispersion_increase[k-1]),
+#                                          na.rm = TRUE) # CuSum variance
+#         }
+#     }
+#     return(dispersion_increase)
+# }
 
 #' Train isolation forest model in parallel
 #' @import parallel
diff --git a/src/isolation_forest.cpp b/src/isolation_forest.cpp
index f1abac22..1f1972a7 100644
--- a/src/isolation_forest.cpp
+++ b/src/isolation_forest.cpp
@@ -201,6 +201,221 @@ DataFrame calculate_anomaly_score(
     );
 }
 
+// #include <vector>
+// #include <string>
+// #include <random>
+// #include <memory>
+// #include <unordered_map>
+// #include <cmath>
+// #include <Rcpp.h>
+// // #include <RcppParallel.h>
+
+// using namespace Rcpp;
+// // using namespace RcppParallel;
+
+
+// const double EM_CONSTANT = 0.5772156649; // Euler-Mascheroni constant
+// std::mt19937 gen(42); // Fixed random seed for reproducibility
+
+// // Define a structure for an Isolation Tree Node
+// struct IsolationTreeNode {
+//   bool is_leaf;
+//   int size;
+//   std::string feature;
+//   double value;
+//   bool is_missing_split;
+//   std::unique_ptr<IsolationTreeNode> left;
+//   std::unique_ptr<IsolationTreeNode> right;
+  
+//   IsolationTreeNode(int s) : is_leaf(true), size(s), value(0), is_missing_split(false) {}
+//   IsolationTreeNode(
+//     std::string feat, 
+//     double val, 
+//     bool missing_split, 
+//     std::unique_ptr<IsolationTreeNode> l, 
+//     std::unique_ptr<IsolationTreeNode> r)
+//     : is_leaf(false), size(0), 
+//       feature(std::move(feat)), 
+//       value(val), is_missing_split(missing_split), 
+//       left(std::move(l)), right(std::move(r)) {}
+// };
+
+// // Convert R DataFrame to C++ data structure
+// std::vector<std::unordered_map<std::string, double>> convert_dataframe(const DataFrame& df) {
+//     std::vector<std::unordered_map<std::string, double>> data;
+//     int n_rows = df.nrows();
+//     CharacterVector col_names = df.names();
+//     int n_cols = col_names.size();
+    
+//     for (int i = 0; i < n_rows; ++i) {
+//       std::unordered_map<std::string, double> row;
+//       for (int j = 0; j < n_cols; ++j) {
+//         NumericVector col = df[j];
+//         row[as<std::string>(col_names[j])] = col[i];
+//       }
+//       data.push_back(row);
+//     }
+//     return data;
+// }
+
+// // Function to create an isolation tree
+// std::unique_ptr<IsolationTreeNode> isolation_tree(
+//     const std::vector<std::unordered_map<std::string, double>>& data, 
+//     int depth = 0, int max_depth = -1) {
+  
+//   int n = data.size();
+//   if (max_depth == -1) {
+//     max_depth = static_cast<int>(log2(n));
+//   }
+//   if (n <= 1 || depth >= max_depth) {
+//     return std::make_unique<IsolationTreeNode>(n);
+//   }
+  
+//   // Create vector of features to split on
+//   std::vector<std::string> features;
+//   for (const auto& [key, _] : data[0]) {
+//     features.push_back(key);
+//   }
+  
+//   std::uniform_int_distribution<> feature_dist(0, features.size() - 1);
+//   std::string split_feature = features[feature_dist(gen)];
+  
+//   // Can split on numeric or missing value
+//   double min_val = data[0].at(split_feature);
+//   double max_val = min_val;
+//   int missing_count = 0;
+
+//   for (const auto& row : data) {
+//     double val = row.at(split_feature);
+//     if (std::isnan(val)) {
+//       missing_count++;
+//       continue;
+//     }
+//     min_val = std::min(min_val, val);
+//     max_val = std::max(max_val, val);
+//   }
+
+//   bool has_missing = (missing_count > 0);
+//   double missing_fraction =
+//       static_cast<double>(missing_count) / static_cast<double>(n);
+  
+//   if (min_val == max_val && !has_missing) {
+//     return std::make_unique<IsolationTreeNode>(n);
+//   }
+  
+//   // --- NEW: calibrated missing split probability ---
+//   double p_miss = std::min(0.5, missing_fraction);
+
+//   bool is_missing_split = false;
+//   double split_value = 0.0;
+
+//   if (has_missing &&
+//       (std::bernoulli_distribution(p_miss)(gen) || min_val == max_val)) {
+//     is_missing_split = true;
+//   } else {
+//     std::uniform_real_distribution<> split_dist(min_val, max_val);
+//     split_value = split_dist(gen);
+//   }
+  
+//   std::vector<std::unordered_map<std::string, double>> left_data, right_data;
+//   for (const auto& row : data) {
+//     if (is_missing_split) {
+//       if (std::isnan(row.at(split_feature))) {
+//         left_data.push_back(row);
+//       } else {
+//         right_data.push_back(row);
+//       }
+//     } else {
+//       if (row.at(split_feature) < split_value) {
+//         left_data.push_back(row);
+//       } else {
+//         right_data.push_back(row);
+//       }
+//     }
+//   }
+  
+//   return std::make_unique<IsolationTreeNode>(
+//     split_feature, split_value, is_missing_split,
+//     isolation_tree(left_data, depth + 1, max_depth),
+//     isolation_tree(right_data, depth + 1, max_depth)
+//   );
+// }
+
+// // Train an isolation forest
+// std::vector<std::unique_ptr<IsolationTreeNode>> isolation_forest(
+//         const std::vector<std::unordered_map<std::string, double>>& data, 
+//         int n_trees, 
+//         int max_depth
+// ) {
+//     std::vector<std::unique_ptr<IsolationTreeNode>> forest;
+//     forest.reserve(n_trees);
+    
+//     for (int i = 0; i < n_trees; ++i) {
+//         forest.push_back(isolation_tree(data, 0, max_depth));
+//     }
+    
+//     return forest;
+// }
+
+// // Compute path length of a single observation in a tree
+// int path_length(const IsolationTreeNode* node, const std::unordered_map<std::string, double>& obs, int depth = 0) {
+//     if (node->is_leaf) {
+//         return depth + node->size;
+//     }
+    
+//     auto it = obs.find(node->feature);
+//     if (it == obs.end()) {
+//         return depth; // If feature is missing, return current depth
+//     }
+    
+//     if (it->second < node->value) {
+//         return path_length(node->left.get(), obs, depth + 1);
+//     } else {
+//         return path_length(node->right.get(), obs, depth + 1);
+//     }
+// }
+
+// // Compute anomaly scores for data
+// // [[Rcpp::export]]
+// DataFrame calculate_anomaly_score(
+//         DataFrame df, 
+//         int n_trees, 
+//         int max_depth
+// ) {
+    
+//     std::vector<std::unordered_map<std::string, double>> data = convert_dataframe(df);
+    
+//     int n = data.size();
+    
+//     // Generate the forest
+//     auto forest = isolation_forest(data, n_trees, max_depth);
+    
+//     // Compute average path lengths
+//     std::vector<double> avg_path_length(n, 0.0);
+//     for (int i = 0; i < n; ++i) {
+//         double total_path_length = 0.0;
+//         for (const auto& tree : forest) {
+//             total_path_length += path_length(tree.get(), data[i]);
+//         }
+//         avg_path_length[i] = total_path_length / n_trees;
+//     }
+    
+//     // Compute anomaly scores
+//     double c_n = 2 * (log(n - 1) + EM_CONSTANT) - (2 * (n - 1) / n);
+//     std::vector<double> scores(n);
+//     for (int i = 0; i < n; ++i) {
+//         scores[i] = std::pow(2, -(avg_path_length[i] / c_n));
+//     }
+    
+//     // Return DataFrame with results
+//     return DataFrame::create(
+//         _["avg_depth"] = avg_path_length,
+//         _["anomaly_score"] = scores
+//     );
+// }
+
+// TODO: Parallelization would be faster within RcppParallel framework
+// but requires setup and testing.
 // Parallel task struct to process each data frame
 // struct ParallelTask : public Worker {
 //     // Input list of data frames

From 2e0f812a765a4900d261ded47e1fcfd87cf6c133 Mon Sep 17 00:00:00 2001
From: devonjkohler <devonjkohler@gmail.com>
Date: Mon, 9 Feb 2026 08:54:26 -0500
Subject: [PATCH 03/11] diann missing values

---
 R/MSstatsConvert_core_functions.R |  24 ++-
 R/utils_anomaly_score.R           |   1 +
 src/isolation_forest.cpp          | 286 +++++++++++++++++-------------
 3 files changed, 190 insertions(+), 121 deletions(-)

diff --git a/R/MSstatsConvert_core_functions.R b/R/MSstatsConvert_core_functions.R
index 8f06973c..27ce3678 100644
--- a/R/MSstatsConvert_core_functions.R
+++ b/R/MSstatsConvert_core_functions.R
@@ -548,14 +548,34 @@ MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction,
                                        temporal_direction[i]))
         }
     }
-    
-    input = .runAnomalyModel(input, 
+
+
+    idx_all_missing = input[, 
+        rowSums(is.na(as.matrix(.SD))) == length(quality_metrics),
+        .SDcols = quality_metrics]
+
+    input_missing_quality = input[idx_all_missing]
+    input_measured_quality = input[!idx_all_missing]
+
+
+    input_measured_quality = .runAnomalyModel(input_measured_quality, 
                              n_trees=n_trees, 
                              max_depth=max_depth, 
                              cores=cores,
                              split_column="PSM",
                              quality_metrics=quality_metrics)
     
+    # Calculate median anomaly score for each PSM
+    median_scores = input_measured_quality[, 
+        .(MedianAnomalyScore = median(AnomalyScores, na.rm = TRUE)), by = PSM]
+    input_missing_quality = merge(
+        input_missing_quality, median_scores, by = "PSM", all.x = TRUE)
+    input_missing_quality$AnomalyScores = input_missing_quality$MedianAnomalyScore
+    input_missing_quality$MedianAnomalyScore = NULL
+
+    # Combine measured and missing quality data
+    input = rbind(input_measured_quality, input_missing_quality, fill = TRUE)
+
     subset_cols = c("Run", "ProteinName", "PeptideSequence", 
                     "PrecursorCharge", "FragmentIon", 
                     "ProductCharge", "IsotopeLabelType", 
diff --git a/R/utils_anomaly_score.R b/R/utils_anomaly_score.R
index 753e13b7..a048cdda 100644
--- a/R/utils_anomaly_score.R
+++ b/R/utils_anomaly_score.R
@@ -278,6 +278,7 @@
     cat(paste0("Number of PSMs to process: ", num_psm), 
         sep = "\n", file = "MSstats_anomaly_model_progress.log")
     
+    # input_data = na.omit(input_data)
     model_results = parallel::parLapply(
         cl, seq_len(num_psm), 
         function(i){
diff --git a/src/isolation_forest.cpp b/src/isolation_forest.cpp
index 1f1972a7..7ac6dae9 100644
--- a/src/isolation_forest.cpp
+++ b/src/isolation_forest.cpp
@@ -208,16 +208,12 @@ DataFrame calculate_anomaly_score(
 // #include <unordered_map>
 // #include <cmath>
 // #include <Rcpp.h>
-// // #include <RcppParallel.h>
 
 // using namespace Rcpp;
-// // using namespace RcppParallel;
-
 
 // const double EM_CONSTANT = 0.5772156649; // Euler-Mascheroni constant
 // std::mt19937 gen(42); // Fixed random seed for reproducibility
 
-// // Define a structure for an Isolation Tree Node
 // struct IsolationTreeNode {
 //   bool is_leaf;
 //   int size;
@@ -226,84 +222,110 @@ DataFrame calculate_anomaly_score(
 //   bool is_missing_split;
 //   std::unique_ptr<IsolationTreeNode> left;
 //   std::unique_ptr<IsolationTreeNode> right;
-  
-//   IsolationTreeNode(int s) : is_leaf(true), size(s), value(0), is_missing_split(false) {}
+
+//   IsolationTreeNode(int s)
+//     : is_leaf(true),
+//       size(s),
+//       feature(""),
+//       value(0.0),
+//       is_missing_split(false),
+//       left(nullptr),
+//       right(nullptr) {}
+
 //   IsolationTreeNode(
-//     std::string feat, 
-//     double val, 
-//     bool missing_split, 
-//     std::unique_ptr<IsolationTreeNode> l, 
-//     std::unique_ptr<IsolationTreeNode> r)
-//     : is_leaf(false), size(0), 
-//       feature(std::move(feat)), 
-//       value(val), is_missing_split(missing_split), 
-//       left(std::move(l)), right(std::move(r)) {}
+//     std::string feat,
+//     double val,
+//     bool missing_split,
+//     std::unique_ptr<IsolationTreeNode> l,
+//     std::unique_ptr<IsolationTreeNode> r
+//   )
+//     : is_leaf(false),
+//       size(0),
+//       feature(std::move(feat)),
+//       value(val),
+//       is_missing_split(missing_split),
+//       left(std::move(l)),
+//       right(std::move(r)) {}
 // };
 
-// // Convert R DataFrame to C++ data structure
-// std::vector<std::unordered_map<std::string, double>> convert_dataframe(const DataFrame& df) {
-//     std::vector<std::unordered_map<std::string, double>> data;
-//     int n_rows = df.nrows();
-//     CharacterVector col_names = df.names();
-//     int n_cols = col_names.size();
-    
-//     for (int i = 0; i < n_rows; ++i) {
-//       std::unordered_map<std::string, double> row;
-//       for (int j = 0; j < n_cols; ++j) {
-//         NumericVector col = df[j];
-//         row[as<std::string>(col_names[j])] = col[i];
-//       }
-//       data.push_back(row);
+// std::vector<std::unordered_map<std::string, double>>
+// convert_dataframe(const DataFrame& df) {
+//   std::vector<std::unordered_map<std::string, double>> data;
+
+//   int n_rows = df.nrows();
+//   CharacterVector col_names = df.names();
+//   int n_cols = col_names.size();
+
+//   data.reserve(n_rows);
+
+//   for (int i = 0; i < n_rows; ++i) {
+//     std::unordered_map<std::string, double> row;
+//     row.reserve(n_cols);
+
+//     for (int j = 0; j < n_cols; ++j) {
+//       NumericVector col = df[j];
+//       row[as<std::string>(col_names[j])] = col[i];
 //     }
-//     return data;
+//     data.push_back(std::move(row));
+//   }
+
+//   return data;
 // }
 
-// // Function to create an isolation tree
-// std::unique_ptr<IsolationTreeNode> isolation_tree(
-//     const std::vector<std::unordered_map<std::string, double>>& data, 
-//     int depth = 0, int max_depth = -1) {
-  
-//   int n = data.size();
+// std::unique_ptr<IsolationTreeNode>
+// isolation_tree(
+//   const std::vector<std::unordered_map<std::string, double>>& data,
+//   int depth = 0,
+//   int max_depth = -1
+// ) {
+//   int n = static_cast<int>(data.size());
+
 //   if (max_depth == -1) {
-//     max_depth = static_cast<int>(log2(n));
+//     max_depth = static_cast<int>(std::log2(n));
 //   }
+
 //   if (n <= 1 || depth >= max_depth) {
 //     return std::make_unique<IsolationTreeNode>(n);
 //   }
-  
-//   // Create vector of features to split on
+
 //   std::vector<std::string> features;
-//   for (const auto& [key, _] : data[0]) {
-//     features.push_back(key);
+//   features.reserve(data[0].size());
+
+//   for (const auto& kv : data[0]) {
+//     features.push_back(kv.first);
 //   }
-  
-//   std::uniform_int_distribution<> feature_dist(0, features.size() - 1);
-//   std::string split_feature = features[feature_dist(gen)];
-  
-//   // Can split on numeric or missing value
+
+//   std::uniform_int_distribution<> feat_dist(0, features.size() - 1);
+//   std::string split_feature = features[feat_dist(gen)];
+
 //   double min_val = data[0].at(split_feature);
 //   double max_val = min_val;
 //   int missing_count = 0;
 
 //   for (const auto& row : data) {
 //     double val = row.at(split_feature);
+
 //     if (std::isnan(val)) {
 //       missing_count++;
 //       continue;
 //     }
+
 //     min_val = std::min(min_val, val);
 //     max_val = std::max(max_val, val);
 //   }
 
 //   bool has_missing = (missing_count > 0);
+
 //   double missing_fraction =
-//       static_cast<double>(missing_count) / static_cast<double>(n);
-  
+//     static_cast<double>(missing_count) / static_cast<double>(n);
+
 //   if (min_val == max_val && !has_missing) {
 //     return std::make_unique<IsolationTreeNode>(n);
 //   }
-  
-//   // --- NEW: calibrated missing split probability ---
+
+//   // Calibrated missing split probability:
+//   // - rare missing -> rare split (more anomalous)
+//   // - common missing -> more missing splits (less anomalous)
 //   double p_miss = std::min(0.5, missing_fraction);
 
 //   bool is_missing_split = false;
@@ -316,104 +338,130 @@ DataFrame calculate_anomaly_score(
 //     std::uniform_real_distribution<> split_dist(min_val, max_val);
 //     split_value = split_dist(gen);
 //   }
-  
-//   std::vector<std::unordered_map<std::string, double>> left_data, right_data;
+
+//   std::vector<std::unordered_map<std::string, double>> left_data;
+//   std::vector<std::unordered_map<std::string, double>> right_data;
+//   left_data.reserve(n);
+//   right_data.reserve(n);
+
 //   for (const auto& row : data) {
+//     double v = row.at(split_feature);
+
 //     if (is_missing_split) {
-//       if (std::isnan(row.at(split_feature))) {
+//       if (std::isnan(v)) {
 //         left_data.push_back(row);
 //       } else {
 //         right_data.push_back(row);
 //       }
 //     } else {
-//       if (row.at(split_feature) < split_value) {
+//       if (v < split_value) {
 //         left_data.push_back(row);
 //       } else {
 //         right_data.push_back(row);
 //       }
 //     }
 //   }
-  
+
 //   return std::make_unique<IsolationTreeNode>(
-//     split_feature, split_value, is_missing_split,
+//     split_feature,
+//     split_value,
+//     is_missing_split,
 //     isolation_tree(left_data, depth + 1, max_depth),
 //     isolation_tree(right_data, depth + 1, max_depth)
 //   );
 // }
 
-// // Train an isolation forest
-// std::vector<std::unique_ptr<IsolationTreeNode>> isolation_forest(
-//         const std::vector<std::unordered_map<std::string, double>>& data, 
-//         int n_trees, 
-//         int max_depth
+// std::vector<std::unique_ptr<IsolationTreeNode>>
+// isolation_forest(
+//   const std::vector<std::unordered_map<std::string, double>>& data,
+//   int n_trees,
+//   int max_depth
 // ) {
-//     std::vector<std::unique_ptr<IsolationTreeNode>> forest;
-//     forest.reserve(n_trees);
-    
-//     for (int i = 0; i < n_trees; ++i) {
-//         forest.push_back(isolation_tree(data, 0, max_depth));
-//     }
-    
-//     return forest;
+//   std::vector<std::unique_ptr<IsolationTreeNode>> forest;
+//   forest.reserve(n_trees);
+
+//   for (int i = 0; i < n_trees; ++i) {
+//     forest.push_back(isolation_tree(data, 0, max_depth));
+//   }
+
+//   return forest;
 // }
 
-// // Compute path length of a single observation in a tree
-// int path_length(const IsolationTreeNode* node, const std::unordered_map<std::string, double>& obs, int depth = 0) {
-//     if (node->is_leaf) {
-//         return depth + node->size;
-//     }
-    
-//     auto it = obs.find(node->feature);
-//     if (it == obs.end()) {
-//         return depth; // If feature is missing, return current depth
-//     }
-    
-//     if (it->second < node->value) {
-//         return path_length(node->left.get(), obs, depth + 1);
-//     } else {
-//         return path_length(node->right.get(), obs, depth + 1);
+// // IMPORTANT FIX: respect is_missing_split during traversal
+// int path_length(
+//   const IsolationTreeNode* node,
+//   const std::unordered_map<std::string, double>& obs,
+//   int depth = 0
+// ) {
+//   if (node->is_leaf) {
+//     return depth + node->size;
+//   }
+
+//   auto it = obs.find(node->feature);
+//   if (it == obs.end()) {
+//     // Feature absent from obs map. Fall back: stop at current depth.
+//     return depth;
+//   }
+
+//   double x = it->second;
+
+//   if (node->is_missing_split) {
+//     if (std::isnan(x)) {
+//       return path_length(node->left.get(), obs, depth + 1);
 //     }
+//     return path_length(node->right.get(), obs, depth + 1);
+//   }
+
+//   if (std::isnan(x)) {
+//     // Numeric split but x missing: choose a side deterministically.
+//     // Here: treat missing as going left.
+//     return path_length(node->left.get(), obs, depth + 1);
+//   }
+
+//   if (x < node->value) {
+//     return path_length(node->left.get(), obs, depth + 1);
+//   }
+
+//   return path_length(node->right.get(), obs, depth + 1);
 // }
 
-// // Compute anomaly scores for data
 // // [[Rcpp::export]]
-// DataFrame calculate_anomaly_score(
-//         DataFrame df, 
-//         int n_trees, 
-//         int max_depth
-// ) {
-    
-//     std::vector<std::unordered_map<std::string, double>> data = convert_dataframe(df);
-    
-//     int n = data.size();
-    
-//     // Generate the forest
-//     auto forest = isolation_forest(data, n_trees, max_depth);
-    
-//     // Compute average path lengths
-//     std::vector<double> avg_path_length(n, 0.0);
-//     for (int i = 0; i < n; ++i) {
-//         double total_path_length = 0.0;
-//         for (const auto& tree : forest) {
-//             total_path_length += path_length(tree.get(), data[i]);
-//         }
-//         avg_path_length[i] = total_path_length / n_trees;
-//     }
-    
-//     // Compute anomaly scores
-//     double c_n = 2 * (log(n - 1) + EM_CONSTANT) - (2 * (n - 1) / n);
-//     std::vector<double> scores(n);
-//     for (int i = 0; i < n; ++i) {
-//         scores[i] = std::pow(2, -(avg_path_length[i] / c_n));
+// DataFrame calculate_anomaly_score(DataFrame df, int n_trees, int max_depth) {
+//   std::vector<std::unordered_map<std::string, double>> data =
+//     convert_dataframe(df);
+
+//   int n = static_cast<int>(data.size());
+
+//   auto forest = isolation_forest(data, n_trees, max_depth);
+
+//   std::vector<double> avg_path_length(n, 0.0);
+
+//   for (int i = 0; i < n; ++i) {
+//     double total_path_length = 0.0;
+
+//     for (const auto& tree : forest) {
+//       total_path_length += path_length(tree.get(), data[i]);
 //     }
-    
-//     // Return DataFrame with results
-//     return DataFrame::create(
-//         _["avg_depth"] = avg_path_length,
-//         _["anomaly_score"] = scores
-//     );
+
+//     avg_path_length[i] = total_path_length / static_cast<double>(n_trees);
+//   }
+
+//   double c_n = 2.0 * (std::log(n - 1.0) + EM_CONSTANT) -
+//     (2.0 * (n - 1.0) / n);
+
+//   std::vector<double> scores(n, 0.0);
+
+//   for (int i = 0; i < n; ++i) {
+//     scores[i] = std::pow(2.0, -(avg_path_length[i] / c_n));
+//   }
+
+//   return DataFrame::create(
+//     _["avg_depth"] = avg_path_length,
+//     _["anomaly_score"] = scores
+//   );
 // }
 
+
 // TODO: Parallelization would be faster within RcppParallel framework
 // but requires setup and testing.
 // Parallel task struct to process each data frame

From 7ec875289f2627b7b747f080f23deb31e6f72075 Mon Sep 17 00:00:00 2001
From: Devon Kohler <devonjkohler@gmail.com>
Date: Wed, 18 Feb 2026 08:16:25 -0500
Subject: [PATCH 04/11] bug fixes to anomaly model

---
 R/MSstatsConvert_core_functions.R | 6 +++---
 R/utils_anomaly_score.R           | 9 +++++++--
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/R/MSstatsConvert_core_functions.R b/R/MSstatsConvert_core_functions.R
index 27ce3678..f104387a 100644
--- a/R/MSstatsConvert_core_functions.R
+++ b/R/MSstatsConvert_core_functions.R
@@ -535,7 +535,7 @@ MSstatsMakeAnnotation = function(input, annotation, ...) {
 MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction,
                                 missing_run_count, n_feat, run_order, n_trees, 
                                 max_depth, cores){
-    
+
     input = .prepareSpectronautAnomalyInput(input, quality_metrics, 
                                             run_order, n_feat, 
                                             missing_run_count)
@@ -549,7 +549,7 @@ MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction,
         }
     }
 
-
+    # browser() #AASDAIPPASPK
     idx_all_missing = input[, 
         rowSums(is.na(as.matrix(.SD))) == length(quality_metrics),
         .SDcols = quality_metrics]
@@ -567,7 +567,7 @@ MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction,
     
     # Calculate median anomaly score for each PSM
     median_scores = input_measured_quality[, 
-        .(MedianAnomalyScore = median(AnomalyScores, na.rm = TRUE)), by = PSM]
+        .(MedianAnomalyScore = mean(AnomalyScores, na.rm = TRUE)), by = PSM]
     input_missing_quality = merge(
         input_missing_quality, median_scores, by = "PSM", all.x = TRUE)
     input_missing_quality$AnomalyScores = input_missing_quality$MedianAnomalyScore
diff --git a/R/utils_anomaly_score.R b/R/utils_anomaly_score.R
index a048cdda..33e91db9 100644
--- a/R/utils_anomaly_score.R
+++ b/R/utils_anomaly_score.R
@@ -278,6 +278,9 @@
     cat(paste0("Number of PSMs to process: ", num_psm), 
         sep = "\n", file = "MSstats_anomaly_model_progress.log")
     
+    # browser()
+    model_input = unique(input_data[, c(split_column, quality_metrics), with = FALSE])
+
     # input_data = na.omit(input_data)
     model_results = parallel::parLapply(
         cl, seq_len(num_psm), 
@@ -286,7 +289,7 @@
                 cat("Finished processing an additional 100 PSMs", 
                     sep = "\n", file = "MSstats_anomaly_model_progress.log", append = TRUE)
             }
-            single_psm = input_data[get(split_column) == psm_list[[i]], 
+            single_psm = model_input[model_input[[split_column]] == psm_list[[i]], 
                                     ..quality_metrics]
             
             if (max_depth == "auto"){
@@ -301,7 +304,9 @@
     
     model_results = unlist(model_results)
     # Clip anomaly scores to stop them from exploding
-    input_data$AnomalyScores = pmax(model_results, .001)
+    model_input$AnomalyScores = pmax(model_results, .001)
     
+    # browser()
+    input_data = merge(input_data, model_input, by = c(split_column, quality_metrics), all.x = TRUE)
     return(input_data)
 }

From ca4335c92dfebf3fe9a416b8ecb76a4583cc2322 Mon Sep 17 00:00:00 2001
From: devonjkohler <devonjkohler@gmail.com>
Date: Wed, 18 Feb 2026 09:24:37 -0500
Subject: [PATCH 05/11] minor documentation to anomaly score functions

---
 R/MSstatsConvert_core_functions.R | 10 ++++--
 R/utils_anomaly_score.R           | 58 -------------------------------
 2 files changed, 7 insertions(+), 61 deletions(-)

diff --git a/R/MSstatsConvert_core_functions.R b/R/MSstatsConvert_core_functions.R
index f104387a..2cb2a941 100644
--- a/R/MSstatsConvert_core_functions.R
+++ b/R/MSstatsConvert_core_functions.R
@@ -518,6 +518,12 @@ MSstatsMakeAnnotation = function(input, annotation, ...) {
 }
 
 #' Run Anomaly Model
+#'
+#' Detects anomalous measurements in mass spectrometry data using an isolation forest algorithm.
+#' This function identifies unusual precursor measurements based on quality metrics and their 
+#' temporal patterns. For features with insufficient quality metric data, it assigns anomaly 
+#' scores based on the median score of similar features (same peptide and charge combination).
+#' The model supports parallel processing for improved performance on large datasets.
 #' 
 #' @param input data.table preprocessed by the MSstatsBalancedDesign function
 #' @param quality_metrics character vector of quality metrics to use in the model
@@ -549,7 +555,6 @@ MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction,
         }
     }
 
-    # browser() #AASDAIPPASPK
     idx_all_missing = input[, 
         rowSums(is.na(as.matrix(.SD))) == length(quality_metrics),
         .SDcols = quality_metrics]
@@ -557,7 +562,6 @@ MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction,
     input_missing_quality = input[idx_all_missing]
     input_measured_quality = input[!idx_all_missing]
 
-
     input_measured_quality = .runAnomalyModel(input_measured_quality, 
                              n_trees=n_trees, 
                              max_depth=max_depth, 
@@ -567,7 +571,7 @@ MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction,
     
     # Calculate median anomaly score for each PSM
     median_scores = input_measured_quality[, 
-        .(MedianAnomalyScore = mean(AnomalyScores, na.rm = TRUE)), by = PSM]
+        .(MedianAnomalyScore = median(AnomalyScores, na.rm = TRUE)), by = PSM]
     input_missing_quality = merge(
         input_missing_quality, median_scores, by = "PSM", all.x = TRUE)
     input_missing_quality$AnomalyScores = input_missing_quality$MedianAnomalyScore
diff --git a/R/utils_anomaly_score.R b/R/utils_anomaly_score.R
index 33e91db9..ee9badd1 100644
--- a/R/utils_anomaly_score.R
+++ b/R/utils_anomaly_score.R
@@ -110,24 +110,6 @@
 
     mean_increase
 }
-# .add_mean_increase = function(quality_vector){
-    
-#     mean_increase = numeric(length(quality_vector))
-#     mean_increase[1] = 0
-#     d = 0.5
-    
-#     for(k in 2:length(quality_vector)) {
-#         # 5 is reference (3 sigma)
-#         if (mean_increase[k - 1] > 5){
-#             mean_increase[k] = max(0,(quality_vector[k] - d), na.rm = TRUE)
-#         } else {
-#             mean_increase[k] = max(0,
-#                                    (quality_vector[k] - d + mean_increase[k-1]),
-#                                    na.rm = TRUE) # positive CuSum
-#         }
-#     }
-#     return(mean_increase)
-# }
 
 #' Calculate mean decrease
 #' @noRd
@@ -163,24 +145,6 @@
 
     mean_decrease
 }
-# .add_mean_decrease = function(quality_vector){
-    
-#     mean_decrease = numeric(length(quality_vector))
-#     mean_decrease[1] = 0
-#     d = -0.5
-    
-#     for(k in 2:length(quality_vector)) {
-#         # 5 is reference (3 sigma)
-#         if (mean_decrease[k - 1] > 5){
-#             mean_decrease[k] <- max(0,(d - quality_vector[k] + 0), na.rm = TRUE)
-#         } else {
-#             mean_decrease[k] <- max(0,
-#                                     (d - quality_vector[k] + mean_decrease[k-1]),
-#                                     na.rm = TRUE) # negative CuSum
-#         }
-#     }
-#     return(mean_decrease)
-# }
 
 #' Calculate dispersion increase
 #' @noRd
@@ -227,26 +191,6 @@
 
     return(dispersion_increase)
 }
-# .add_dispersion_increase = function(quality_vector){
-#     dispersion_increase = numeric(length(quality_vector))
-#     v = numeric(length(quality_vector))
-#     v[1] = (sqrt(abs(quality_vector[1]))-0.822)/0.349
-#     d = 0.5
-#     for(k in 2:length(quality_vector)) {
-        
-#         v[k] = (sqrt(abs(quality_vector[k]))-0.822)/0.349 
-        
-#         if (dispersion_increase[k - 1] > 5){
-#             dispersion_increase[k] = max(0,(v[k] - d),
-#                                          na.rm = TRUE)
-#         } else {
-#             dispersion_increase[k] = max(0, 
-#                                          (v[k] - d + dispersion_increase[k-1]),
-#                                          na.rm = TRUE) # CuSum variance
-#         }
-#     }
-#     return(dispersion_increase)
-# }
 
 #' Train isolation forest model in parallel
 #' @import parallel
@@ -278,10 +222,8 @@
     cat(paste0("Number of PSMs to process: ", num_psm), 
         sep = "\n", file = "MSstats_anomaly_model_progress.log")
     
-    # browser()
     model_input = unique(input_data[, c(split_column, quality_metrics), with = FALSE])
 
-    # input_data = na.omit(input_data)
     model_results = parallel::parLapply(
         cl, seq_len(num_psm), 
         function(i){

From 4023a70e1e5c080847f4929fda12fc29c489076f Mon Sep 17 00:00:00 2001
From: devonjkohler <devonjkohler@gmail.com>
Date: Wed, 18 Feb 2026 09:25:23 -0500
Subject: [PATCH 06/11] DIA-NN msstats+ converter docs

---
 DESCRIPTION                 |  2 +-
 man/DIANNtoMSstatsFormat.Rd | 43 ++++++++++++++++++++++++++++++-------
 man/MSstatsAnomalyScores.Rd |  6 +++++-
 man/MSstatsClean.Rd         |  4 +++-
 man/dot-cleanRawDIANN.Rd    |  4 +++-
 5 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 81b3bb72..cb89a875 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -14,7 +14,7 @@ License: Artistic-2.0
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.2
+RoxygenNote: 7.3.3
 biocViews: MassSpectrometry, Proteomics, Software, DataImport, QualityControl
 Depends:
     R (>= 4.0)
diff --git a/man/DIANNtoMSstatsFormat.Rd b/man/DIANNtoMSstatsFormat.Rd
index 269db0a3..f46b2ad0 100644
--- a/man/DIANNtoMSstatsFormat.Rd
+++ b/man/DIANNtoMSstatsFormat.Rd
@@ -14,12 +14,21 @@ DIANNtoMSstatsFormat(
   removeFewMeasurements = TRUE,
   removeOxidationMpeptides = TRUE,
   removeProtein_with1Feature = TRUE,
+  MBR = TRUE,
+  quantificationColumn = "FragmentQuantCorrected",
+  calculateAnomalyScores = FALSE,
+  anomalyModelFeatures = c(),
+  anomalyModelFeatureTemporal = c(),
+  removeMissingFeatures = 0.5,
+  anomalyModelFeatureCount = 100,
+  runOrder = NULL,
+  n_trees = 100,
+  max_depth = "auto",
+  numberOfCores = 1,
   use_log_file = TRUE,
   append = FALSE,
   verbose = TRUE,
   log_file_path = NULL,
-  MBR = TRUE,
-  quantificationColumn = "FragmentQuantCorrected",
   ...
 )
 }
@@ -51,6 +60,30 @@ Default is 0.01.}
 
 \item{removeProtein_with1Feature}{should proteins with a single feature be removed}
 
+\item{MBR}{True if analysis was done with match between runs}
+
+\item{quantificationColumn}{Use 'FragmentQuantCorrected'(default) column for quantified intensities for DIANN 1.8.x.
+Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x.
+Use 'auto' for quantified intensities for DIANN 2.x where each fragment intensity is a separate column, e.g. Fr0Quantity.}
+
+\item{calculateAnomalyScores}{Default is FALSE. If TRUE, will run anomaly detection model and calculate anomaly scores for each feature. Used downstream to weigh measurements in differential analysis.}
+
+\item{anomalyModelFeatures}{character vector of quality metric column names to be used as features in the anomaly detection model. List must not be empty if calculateAnomalyScores=TRUE.}
+
+\item{anomalyModelFeatureTemporal}{character vector of temporal direction corresponding to columns passed to anomalyModelFeatures. Values must be one of: \code{mean_decrease}, \code{mean_increase}, \code{dispersion_increase}, or NULL (to perform no temporal feature engineering). Default is empty vector. If calculateAnomalyScores=TRUE, vector must have as many values as anomalyModelFeatures (even if all NULL).}
+
+\item{removeMissingFeatures}{Remove features with missing values in more than this fraction of runs. Default is 0.5. Only used if calculateAnomalyScores=TRUE.}
+
+\item{anomalyModelFeatureCount}{Feature selection for anomaly model. Anomaly detection works on the precursor-level and can be much slower if all features used. We will by default filter to the top-100 highest intensity features. This can be adjusted as necessary. To turn feature-selection off, set this value to a high number (e.g. 10000). Only used if calculateAnomalyScores=TRUE.}
+
+\item{runOrder}{Temporal order of MS runs. Should be a two column data.table with columns \code{Run} and \code{Order}, where \code{Run} matches the run name output by Spectronaut and \code{Order} is an integer. Used to engineer the temporal features defined in anomalyModelFeatureTemporal.}
+
+\item{n_trees}{Number of trees to use in isolation forest when calculateAnomalyScores=TRUE. Default is 100.}
+
+\item{max_depth}{Max tree depth to use in isolation forest when calculateAnomalyScores=TRUE. Default is "auto" which calculates depth as log2(N) where N is the number of runs. Otherwise must be an integer.}
+
+\item{numberOfCores}{Number of cores for parallel processing anomaly detection model. When > 1, a logfile named 'MSstats_anomaly_model_progress.log' is created to track progress. Only works for Linux & Mac OS. Default is 1.}
+
 \item{use_log_file}{logical. If TRUE, information about data processing
 will be saved to a file.}
 
@@ -65,12 +98,6 @@ data processing will be saved.
 If not provided, such a file will be created automatically.
 If \code{append = TRUE}, has to be a valid path to a file.}
 
-\item{MBR}{True if analysis was done with match between runs}
-
-\item{quantificationColumn}{Use 'FragmentQuantCorrected'(default) column for quantified intensities for DIANN 1.8.x.
-Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x.
-Use 'auto' for quantified intensities for DIANN 2.x where each fragment intensity is a separate column, e.g. Fr0Quantity.}
-
 \item{...}{additional parameters to \code{data.table::fread}.}
 }
 \value{
diff --git a/man/MSstatsAnomalyScores.Rd b/man/MSstatsAnomalyScores.Rd
index bc2b18a3..b6a9d500 100644
--- a/man/MSstatsAnomalyScores.Rd
+++ b/man/MSstatsAnomalyScores.Rd
@@ -39,5 +39,9 @@ MSstatsAnomalyScores(
 data.table
 }
 \description{
-Run Anomaly Model
+Detects anomalous measurements in mass spectrometry data using an isolation forest algorithm.
+This function identifies unusual precursor measurements based on quality metrics and their
+temporal patterns. For features with insufficient quality metric data, it assigns anomaly
+scores based on the median score of similar features (same peptide and charge combination).
+The model supports parallel processing for improved performance on large datasets.
 }
diff --git a/man/MSstatsClean.Rd b/man/MSstatsClean.Rd
index 0e71ab7d..affbd147 100644
--- a/man/MSstatsClean.Rd
+++ b/man/MSstatsClean.Rd
@@ -66,7 +66,9 @@ MSstatsClean(msstats_object, ...)
 \S4method{MSstatsClean}{MSstatsDIANNFiles}(
   msstats_object,
   MBR = TRUE,
-  quantificationColumn = "FragmentQuantCorrected"
+  quantificationColumn = "FragmentQuantCorrected",
+  calculateAnomalyScores = FALSE,
+  anomalyModelFeatures = c()
 )
 
 \S4method{MSstatsClean}{MSstatsMetamorpheusFiles}(msstats_object, MBR = TRUE, qvalue_cutoff = 0.05)
diff --git a/man/dot-cleanRawDIANN.Rd b/man/dot-cleanRawDIANN.Rd
index 118195e7..fa0e1da8 100644
--- a/man/dot-cleanRawDIANN.Rd
+++ b/man/dot-cleanRawDIANN.Rd
@@ -7,7 +7,9 @@
 .cleanRawDIANN(
   msstats_object,
   MBR = TRUE,
-  quantificationColumn = "FragmentQuantCorrected"
+  quantificationColumn = "FragmentQuantCorrected",
+  calculateAnomalyScores = FALSE,
+  anomalyModelFeatures = c()
 )
 }
 \arguments{

From c28b200d5c9b95130f42fa1860c0c495478773e1 Mon Sep 17 00:00:00 2001
From: devonjkohler <devonjkohler@gmail.com>
Date: Wed, 18 Feb 2026 10:17:40 -0500
Subject: [PATCH 07/11] remove missing value split

---
 R/MSstatsConvert_core_functions.R | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/R/MSstatsConvert_core_functions.R b/R/MSstatsConvert_core_functions.R
index 2cb2a941..b75eeba3 100644
--- a/R/MSstatsConvert_core_functions.R
+++ b/R/MSstatsConvert_core_functions.R
@@ -555,14 +555,14 @@ MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction,
         }
     }
 
-    idx_all_missing = input[, 
-        rowSums(is.na(as.matrix(.SD))) == length(quality_metrics),
-        .SDcols = quality_metrics]
+    # idx_all_missing = input[, 
+    #     rowSums(is.na(as.matrix(.SD))) == length(quality_metrics),
+    #     .SDcols = quality_metrics]
 
-    input_missing_quality = input[idx_all_missing]
-    input_measured_quality = input[!idx_all_missing]
+    # input_missing_quality = input[idx_all_missing]
+    # input_measured_quality = input[!idx_all_missing]
 
-    input_measured_quality = .runAnomalyModel(input_measured_quality, 
+    input = .runAnomalyModel(input, 
                              n_trees=n_trees, 
                              max_depth=max_depth, 
                              cores=cores,
@@ -570,15 +570,15 @@ MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction,
                              quality_metrics=quality_metrics)
     
     # Calculate median anomaly score for each PSM
-    median_scores = input_measured_quality[, 
-        .(MedianAnomalyScore = median(AnomalyScores, na.rm = TRUE)), by = PSM]
-    input_missing_quality = merge(
-        input_missing_quality, median_scores, by = "PSM", all.x = TRUE)
-    input_missing_quality$AnomalyScores = input_missing_quality$MedianAnomalyScore
-    input_missing_quality$MedianAnomalyScore = NULL
+    # median_scores = input_measured_quality[, 
+    #     .(MedianAnomalyScore = median(AnomalyScores, na.rm = TRUE)), by = PSM]
+    # input_missing_quality = merge(
+    #     input_missing_quality, median_scores, by = "PSM", all.x = TRUE)
+    # input_missing_quality$AnomalyScores = input_missing_quality$MedianAnomalyScore
+    # input_missing_quality$MedianAnomalyScore = NULL
 
     # Combine measured and missing quality data
-    input = rbind(input_measured_quality, input_missing_quality, fill = TRUE)
+    # input = rbind(input_measured_quality, input_missing_quality, fill = TRUE)
 
     subset_cols = c("Run", "ProteinName", "PeptideSequence", 
                     "PrecursorCharge", "FragmentIon", 

From 6a764d05fafeda517a7afbd60b7b43b5ff0d5fa7 Mon Sep 17 00:00:00 2001
From: devonjkohler <devonjkohler@gmail.com>
Date: Wed, 18 Feb 2026 10:41:33 -0500
Subject: [PATCH 08/11] rollback

---
 R/MSstatsConvert_core_functions.R | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/R/MSstatsConvert_core_functions.R b/R/MSstatsConvert_core_functions.R
index b75eeba3..2cb2a941 100644
--- a/R/MSstatsConvert_core_functions.R
+++ b/R/MSstatsConvert_core_functions.R
@@ -555,14 +555,14 @@ MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction,
         }
     }
 
-    # idx_all_missing = input[, 
-    #     rowSums(is.na(as.matrix(.SD))) == length(quality_metrics),
-    #     .SDcols = quality_metrics]
+    idx_all_missing = input[, 
+        rowSums(is.na(as.matrix(.SD))) == length(quality_metrics),
+        .SDcols = quality_metrics]
 
-    # input_missing_quality = input[idx_all_missing]
-    # input_measured_quality = input[!idx_all_missing]
+    input_missing_quality = input[idx_all_missing]
+    input_measured_quality = input[!idx_all_missing]
 
-    input = .runAnomalyModel(input, 
+    input_measured_quality = .runAnomalyModel(input_measured_quality, 
                              n_trees=n_trees, 
                              max_depth=max_depth, 
                              cores=cores,
@@ -570,15 +570,15 @@ MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction,
                              quality_metrics=quality_metrics)
     
     # Calculate median anomaly score for each PSM
-    # median_scores = input_measured_quality[, 
-    #     .(MedianAnomalyScore = median(AnomalyScores, na.rm = TRUE)), by = PSM]
-    # input_missing_quality = merge(
-    #     input_missing_quality, median_scores, by = "PSM", all.x = TRUE)
-    # input_missing_quality$AnomalyScores = input_missing_quality$MedianAnomalyScore
-    # input_missing_quality$MedianAnomalyScore = NULL
+    median_scores = input_measured_quality[, 
+        .(MedianAnomalyScore = median(AnomalyScores, na.rm = TRUE)), by = PSM]
+    input_missing_quality = merge(
+        input_missing_quality, median_scores, by = "PSM", all.x = TRUE)
+    input_missing_quality$AnomalyScores = input_missing_quality$MedianAnomalyScore
+    input_missing_quality$MedianAnomalyScore = NULL
 
     # Combine measured and missing quality data
-    # input = rbind(input_measured_quality, input_missing_quality, fill = TRUE)
+    input = rbind(input_measured_quality, input_missing_quality, fill = TRUE)
 
     subset_cols = c("Run", "ProteinName", "PeptideSequence", 
                     "PrecursorCharge", "FragmentIon", 

From 67a50908041d81ec1d1f3df84b504d504f27ab8e Mon Sep 17 00:00:00 2001
From: devonjkohler <devonjkohler@gmail.com>
Date: Sat, 21 Feb 2026 08:06:40 -0500
Subject: [PATCH 09/11] updated anomaly calc

---
 R/MSstatsConvert_core_functions.R | 20 +-------------------
 R/utils_anomaly_score.R           |  8 +++-----
 2 files changed, 4 insertions(+), 24 deletions(-)

diff --git a/R/MSstatsConvert_core_functions.R b/R/MSstatsConvert_core_functions.R
index 2cb2a941..c5e98c77 100644
--- a/R/MSstatsConvert_core_functions.R
+++ b/R/MSstatsConvert_core_functions.R
@@ -555,30 +555,12 @@ MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction,
         }
     }
 
-    idx_all_missing = input[, 
-        rowSums(is.na(as.matrix(.SD))) == length(quality_metrics),
-        .SDcols = quality_metrics]
-
-    input_missing_quality = input[idx_all_missing]
-    input_measured_quality = input[!idx_all_missing]
-
-    input_measured_quality = .runAnomalyModel(input_measured_quality, 
+    input = .runAnomalyModel(input, 
                              n_trees=n_trees, 
                              max_depth=max_depth, 
                              cores=cores,
                              split_column="PSM",
                              quality_metrics=quality_metrics)
-    
-    # Calculate median anomaly score for each PSM
-    median_scores = input_measured_quality[, 
-        .(MedianAnomalyScore = median(AnomalyScores, na.rm = TRUE)), by = PSM]
-    input_missing_quality = merge(
-        input_missing_quality, median_scores, by = "PSM", all.x = TRUE)
-    input_missing_quality$AnomalyScores = input_missing_quality$MedianAnomalyScore
-    input_missing_quality$MedianAnomalyScore = NULL
-
-    # Combine measured and missing quality data
-    input = rbind(input_measured_quality, input_missing_quality, fill = TRUE)
 
     subset_cols = c("Run", "ProteinName", "PeptideSequence", 
                     "PrecursorCharge", "FragmentIon", 
diff --git a/R/utils_anomaly_score.R b/R/utils_anomaly_score.R
index ee9badd1..d5981732 100644
--- a/R/utils_anomaly_score.R
+++ b/R/utils_anomaly_score.R
@@ -244,11 +244,9 @@
         }
     )
     
-    model_results = unlist(model_results)
-    # Clip anomaly scores to stop them from exploding
-    model_input$AnomalyScores = pmax(model_results, .001)
-    
-    # browser()
+    model_input$AnomalyScores = unlist(model_results)
+
     input_data = merge(input_data, model_input, by = c(split_column, quality_metrics), all.x = TRUE)
+      
     return(input_data)
 }

From 692546b729a2ccfc9e7f9a6a67a00402b41e841b Mon Sep 17 00:00:00 2001
From: Devon Kohler <35807256+devonjkohler@users.noreply.github.com>
Date: Mon, 2 Mar 2026 10:14:39 -0500
Subject: [PATCH 10/11] Update R/converters_DIANNtoMSstatsFormat.R

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
---
 R/converters_DIANNtoMSstatsFormat.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/converters_DIANNtoMSstatsFormat.R b/R/converters_DIANNtoMSstatsFormat.R
index 806ce95b..82268c8a 100644
--- a/R/converters_DIANNtoMSstatsFormat.R
+++ b/R/converters_DIANNtoMSstatsFormat.R
@@ -28,7 +28,7 @@
 #' @param anomalyModelFeatureTemporal character vector of temporal direction corresponding to columns passed to anomalyModelFeatures. Values must be one of: `mean_decrease`, `mean_increase`, `dispersion_increase`, or NULL (to perform no temporal feature engineering). Default is empty vector. If calculateAnomalyScores=TRUE, vector must have as many values as anomalyModelFeatures (even if all NULL).
 #' @param removeMissingFeatures Remove features with missing values in more than this fraction of runs. Default is 0.5. Only used if calculateAnomalyScores=TRUE.
 #' @param anomalyModelFeatureCount Feature selection for anomaly model. Anomaly detection works on the precursor-level and can be much slower if all features used. We will by default filter to the top-100 highest intensity features. This can be adjusted as necessary. To turn feature-selection off, set this value to a high number (e.g. 10000). Only used if calculateAnomalyScores=TRUE.
-#' @param runOrder Temporal order of MS runs. Should be a two column data.table with columns `Run` and `Order`, where `Run` matches the run name output by Spectronaut and `Order` is an integer. Used to engineer the temporal features defined in anomalyModelFeatureTemporal.
+#' `@param` runOrder Temporal order of MS runs. Should be a two column data.table with columns `Run` and `Order`, where `Run` matches the run name output by DIA-NN and `Order` is an integer. Used to engineer the temporal features defined in anomalyModelFeatureTemporal.
 #' @param n_trees Number of trees to use in isolation forest when calculateAnomalyScores=TRUE. Default is 100.
 #' @param max_depth Max tree depth to use in isolation forest when calculateAnomalyScores=TRUE. Default is "auto" which calculates depth as log2(N) where N is the number of runs. Otherwise must be an integer.
 #' @param numberOfCores Number of cores for parallel processing anomaly detection model. When > 1, a logfile named 'MSstats_anomaly_model_progress.log' is created to track progress. Only works for Linux & Mac OS. Default is 1.

From 004766b721e7cb363966b73bcc55e139f9c1097d Mon Sep 17 00:00:00 2001
From: Devon Kohler <35807256+devonjkohler@users.noreply.github.com>
Date: Mon, 2 Mar 2026 10:23:12 -0500
Subject: [PATCH 11/11] Fix formatting of runOrder parameter documentation

---
 R/converters_DIANNtoMSstatsFormat.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/converters_DIANNtoMSstatsFormat.R b/R/converters_DIANNtoMSstatsFormat.R
index aa80b2f6..833990f6 100644
--- a/R/converters_DIANNtoMSstatsFormat.R
+++ b/R/converters_DIANNtoMSstatsFormat.R
@@ -27,7 +27,7 @@
 #' @param anomalyModelFeatureTemporal character vector of temporal direction corresponding to columns passed to anomalyModelFeatures. Values must be one of: `mean_decrease`, `mean_increase`, `dispersion_increase`, or NULL (to perform no temporal feature engineering). Default is empty vector. If calculateAnomalyScores=TRUE, vector must have as many values as anomalyModelFeatures (even if all NULL).
 #' @param removeMissingFeatures Remove features with missing values in more than this fraction of runs. Default is 0.5. Only used if calculateAnomalyScores=TRUE.
 #' @param anomalyModelFeatureCount Feature selection for anomaly model. Anomaly detection works on the precursor-level and can be much slower if all features used. We will by default filter to the top-100 highest intensity features. This can be adjusted as necessary. To turn feature-selection off, set this value to a high number (e.g. 10000). Only used if calculateAnomalyScores=TRUE.
-#' `@param` runOrder Temporal order of MS runs. Should be a two column data.table with columns `Run` and `Order`, where `Run` matches the run name output by DIA-NN and `Order` is an integer. Used to engineer the temporal features defined in anomalyModelFeatureTemporal.
+#' @param runOrder Temporal order of MS runs. Should be a two column data.table with columns `Run` and `Order`, where `Run` matches the run name output by DIA-NN and `Order` is an integer. Used to engineer the temporal features defined in anomalyModelFeatureTemporal.
 #' @param n_trees Number of trees to use in isolation forest when calculateAnomalyScores=TRUE. Default is 100.
 #' @param max_depth Max tree depth to use in isolation forest when calculateAnomalyScores=TRUE. Default is "auto" which calculates depth as log2(N) where N is the number of runs. Otherwise must be an integer.
 #' @param numberOfCores Number of cores for parallel processing anomaly detection model. When > 1, a logfile named 'MSstats_anomaly_model_progress.log' is created to track progress. Only works for Linux & Mac OS. Default is 1.
@@ -145,4 +145,4 @@ DIANNtoMSstatsFormat = function(
     getOption("MSstatsMsg")("INFO", msg_final)
     getOption("MSstatsLog")("INFO", "\n")
     input
-}
\ No newline at end of file
+}