Vitek-Lab · devonjkohler · Mar 2, 2026 · Jan 13, 2026 · Jan 13, 2026 · Feb 9, 2026
diff --git a/.gitignore b/.gitignore
@@ -8,4 +8,6 @@ inst/doc
 *.log
 *.o
 *.so
-*.dll
+*.dll
+.lintr
+.vscode
diff --git a/R/MSstatsConvert_core_functions.R b/R/MSstatsConvert_core_functions.R
@@ -518,6 +518,12 @@ MSstatsMakeAnnotation = function(input, annotation, ...) {
 }
 
 #' Run Anomaly Model
+#'
+#' Detects anomalous measurements in mass spectrometry data using an isolation forest algorithm.
+#' This function identifies unusual precursor measurements based on quality metrics and their 
+#' temporal patterns. For features with insufficient quality metric data, it assigns anomaly 
+#' scores based on the median score of similar features (same peptide and charge combination).
+#' The model supports parallel processing for improved performance on large datasets.
 #' 
 #' @param input data.table preprocessed by the MSstatsBalancedDesign function
 #' @param quality_metrics character vector of quality metrics to use in the model
@@ -535,7 +541,7 @@ MSstatsMakeAnnotation = function(input, annotation, ...) {
 MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction,
                                 missing_run_count, n_feat, run_order, n_trees, 
                                 max_depth, cores){
-    
+
     input = .prepareSpectronautAnomalyInput(input, quality_metrics, 
                                             run_order, n_feat, 
                                             missing_run_count)
@@ -548,14 +554,14 @@ MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction,
                                        temporal_direction[i]))
         }
     }
-    
+
     input = .runAnomalyModel(input, 
                              n_trees=n_trees, 
                              max_depth=max_depth, 
                              cores=cores,
                              split_column="PSM",
                              quality_metrics=quality_metrics)
-    
+
     subset_cols = c("Run", "ProteinName", "PeptideSequence", 
                     "PrecursorCharge", "FragmentIon", 
                     "ProductCharge", "IsotopeLabelType", 

diff --git a/R/clean_DIANN.R b/R/clean_DIANN.R
@@ -8,7 +8,9 @@
                            quantificationColumn = "FragmentQuantCorrected",
                            global_qvalue_cutoff = 0.01,
                            qvalue_cutoff = 0.01, 
-                           pg_qvalue_cutoff = 0.01) {
+                           pg_qvalue_cutoff = 0.01,
+                           calculateAnomalyScores = FALSE, 
+                           anomalyModelFeatures = c()) {
     dn_input <- getInputFile(msstats_object, "input")
     dn_input <- data.table::as.data.table(dn_input)
 
@@ -19,7 +21,9 @@
     dn_input <- .cleanDIANNAddMissingColumns(dn_input)
 
     # Select required columns
-    dn_input <- .cleanDIANNSelectRequiredColumns(dn_input, quantificationColumn, MBR)
+    dn_input <- .cleanDIANNSelectRequiredColumns(dn_input, quantificationColumn, MBR,
+                                                 calculateAnomalyScores,
+                                                 anomalyModelFeatures)
 
     # Split concatenated values
     dn_input <- .cleanDIANNSplitConcatenatedValues(dn_input, quantificationColumn)
@@ -78,7 +82,9 @@
 #' @param MBR logical indicating if match between runs was used
 #' @return data.table with selected columns
 #' @noRd
-.cleanDIANNSelectRequiredColumns <- function(dn_input, quantificationColumn, MBR) {
+.cleanDIANNSelectRequiredColumns <- function(dn_input, quantificationColumn, MBR,
+                                             calculateAnomalyScores,
+                                             anomalyModelFeatures) {
     base_cols <- c('ProteinNames', 'StrippedSequence', 'ModifiedSequence', 
                    'PrecursorCharge', quantificationColumn, 'QValue', 
                    'PrecursorMz', 'FragmentInfo', 'Run')
@@ -89,7 +95,13 @@
         c('GlobalQValue', 'GlobalPGQValue')
     }
 
-    req_cols <- c(base_cols, mbr_cols)
+    qual_cols <- if (calculateAnomalyScores) {
+        anomalyModelFeatures
+    } else {
+        c()
+    }
+
+    req_cols <- c(base_cols, mbr_cols, qual_cols)
     return(dn_input[, req_cols, with = FALSE])
 }
 
@@ -163,13 +175,13 @@
     getOption("MSstatsLog")("INFO", msg)
     getOption("MSstatsMsg")("INFO", msg)
 
-    dn_input = dn_input[QValue < global_qvalue_cutoff, ]
+    dn_input = dn_input[QValue >= global_qvalue_cutoff, quantificationColumn := 0]
     if (MBR) {
         msg = '** MBR was used to analyze the data. Now setting names and filtering'
         msg_1_mbr = paste0('-- LibPGQValue < ', pg_qvalue_cutoff)
         msg_2_mbr = paste0('-- LibQValue < ', qvalue_cutoff)
-        dn_input = dn_input[LibPGQValue < pg_qvalue_cutoff, ]
-        dn_input = dn_input[LibQValue < qvalue_cutoff, ]
+        dn_input = dn_input[LibPGQValue >= pg_qvalue_cutoff, , quantificationColumn := 0]
+        dn_input = dn_input[LibQValue >= qvalue_cutoff, , quantificationColumn := 0]
         getOption("MSstatsLog")("INFO", msg)
         getOption("MSstatsMsg")("INFO", msg)
         getOption("MSstatsLog")("INFO", msg_1_mbr)
@@ -181,8 +193,8 @@
         msg = '** MBR was not used to analyze the data. Now setting names and filtering'
         msg_1 = paste0('-- Filtering on GlobalPGQValue < ', pg_qvalue_cutoff)
         msg_2 = paste0('-- Filtering on GlobalQValue < ', qvalue_cutoff)
-        dn_input = dn_input[GlobalPGQValue < pg_qvalue_cutoff, ]
-        dn_input = dn_input[GlobalQValue < qvalue_cutoff, ]
+        dn_input = dn_input[GlobalPGQValue >= pg_qvalue_cutoff, quantificationColumn := 0]
+        dn_input = dn_input[GlobalQValue >= qvalue_cutoff, quantificationColumn := 0]
         getOption("MSstatsLog")("INFO", msg)
         getOption("MSstatsMsg")("INFO", msg)
         getOption("MSstatsLog")("INFO", msg_1)

diff --git a/R/converters_DIANNtoMSstatsFormat.R b/R/converters_DIANNtoMSstatsFormat.R
@@ -22,6 +22,15 @@
 #' @param quantificationColumn Use 'FragmentQuantCorrected'(default) column for quantified intensities for DIANN 1.8.x.
 #' Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x. 
 #' Use 'auto' for quantified intensities for DIANN 2.x where each fragment intensity is a separate column, e.g. Fr0Quantity.
+#' @param calculateAnomalyScores Default is FALSE. If TRUE, runs the anomaly detection model and calculates an anomaly score for each feature. The scores are used downstream to weight measurements in differential analysis.
+#' @param anomalyModelFeatures character vector of quality metric column names to be used as features in the anomaly detection model. Must not be empty if calculateAnomalyScores=TRUE.
+#' @param anomalyModelFeatureTemporal character vector of temporal direction corresponding to columns passed to anomalyModelFeatures. Values must be one of: `mean_decrease`, `mean_increase`, `dispersion_increase`, or NULL (to perform no temporal feature engineering). Default is empty vector. If calculateAnomalyScores=TRUE, vector must have as many values as anomalyModelFeatures (even if all NULL).
-#' @param anomalyModelFeatureTemporal character vector of temporal direction corresponding to columns passed to anomalyModelFeatures. Values must be one of: `mean_decrease`, `mean_increase`, `dispersion_increase`, or NULL (to perform no temporal feature engineering). Default is empty vector. If calculateAnomalyScores=TRUE, vector must have as many values as anomalyModelFeatures (even if all NULL).
+#' @param anomalyModelFeatureTemporal character vector of temporal direction corresponding to columns passed to anomalyModelFeatures. Values must be one of: `mean_decrease`, `mean_increase`, `dispersion_increase`, or a sentinel indicating no temporal feature engineering (e.g. `none` or `NA_character_`). Default is empty vector. If calculateAnomalyScores=TRUE, vector must have as many values as anomalyModelFeatures (even if all indicate no temporal feature engineering).
-#' @param anomalyModelFeatureTemporal character vector of temporal direction corresponding to columns passed to anomalyModelFeatures. Values must be one of: `mean_decrease`, `mean_increase`, `dispersion_increase`, or NULL (to perform no temporal feature engineering). Default is empty vector. If calculateAnomalyScores=TRUE, vector must have as many values as anomalyModelFeatures (even if all NULL).
+#' @param anomalyModelFeatureTemporal character vector of temporal direction corresponding to columns passed to anomalyModelFeatures. Values must be one of: `mean_decrease`, `mean_increase`, `dispersion_increase`, or a sentinel indicating no temporal feature engineering (e.g. `none` or `NA_character_`). Default is empty vector. If calculateAnomalyScores=TRUE, vector must have as many values as anomalyModelFeatures (even if all indicate no temporal feature engineering).
+#' @param removeMissingFeatures Remove features with missing values in more than this fraction of runs. Default is 0.5. Only used if calculateAnomalyScores=TRUE.
+#' @param anomalyModelFeatureCount Number of features selected for the anomaly model. Anomaly detection works at the precursor level and can be much slower if all features are used. By default, only the top 100 highest-intensity features are kept. This can be adjusted as necessary. To turn feature selection off, set this value to a high number (e.g. 10000). Only used if calculateAnomalyScores=TRUE.
+#' @param runOrder Temporal order of MS runs. Should be a two column data.table with columns `Run` and `Order`, where `Run` matches the run name output by DIA-NN and `Order` is an integer. Used to engineer the temporal features defined in anomalyModelFeatureTemporal.
+#' @param n_trees Number of trees to use in isolation forest when calculateAnomalyScores=TRUE. Default is 100.
+#' @param max_depth Max tree depth to use in isolation forest when calculateAnomalyScores=TRUE. Default is "auto" which calculates depth as log2(N) where N is the number of runs. Otherwise must be an integer.
+#' @param numberOfCores Number of cores to use for parallel processing of the anomaly detection model. When > 1, a log file named 'MSstats_anomaly_model_progress.log' is created to track progress. Parallel processing only works on Linux and macOS. Default is 1.
 #' @param ... additional parameters to `data.table::fread`.
 #'  
 #' @return data.frame in the MSstats required format.
@@ -51,29 +60,39 @@
 #' output_2_0 = DIANNtoMSstatsFormat(input_2_0, annotation = annot_2_0, MBR = FALSE, 
 #'                                 use_log_file = FALSE, quantificationColumn = 'auto')
 #' head(output_2_0)
-DIANNtoMSstatsFormat = function(input, annotation = NULL,
-                                global_qvalue_cutoff = 0.01,
-                                qvalue_cutoff = 0.01, 
-                                pg_qvalue_cutoff = 0.01,
-                                useUniquePeptide = TRUE, 
-                                removeFewMeasurements = TRUE,
-                                removeOxidationMpeptides = TRUE, 
-                                removeProtein_with1Feature = TRUE,
-                                use_log_file = TRUE, append = FALSE, 
-                                verbose = TRUE, log_file_path = NULL,
-                                MBR = TRUE, 
-                                quantificationColumn = "FragmentQuantCorrected",
-                                ...) {
+DIANNtoMSstatsFormat = function(
+    input, annotation = NULL,
+    global_qvalue_cutoff = 0.01,
+    qvalue_cutoff = 0.01, 
+    pg_qvalue_cutoff = 0.01,
+    useUniquePeptide = TRUE, 
+    removeFewMeasurements = TRUE,
+    removeOxidationMpeptides = TRUE, 
+    removeProtein_with1Feature = TRUE,
+    MBR = TRUE, 
+    quantificationColumn = "FragmentQuantCorrected",
+    calculateAnomalyScores=FALSE, anomalyModelFeatures=c(),
+    anomalyModelFeatureTemporal=c(), removeMissingFeatures=.5,
+    anomalyModelFeatureCount=100,
+    runOrder=NULL, n_trees=100, max_depth="auto", numberOfCores=1, 
+    use_log_file = TRUE, append = FALSE, 
+    verbose = TRUE, log_file_path = NULL,
+    ...) {
     MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose, 
                                         log_file_path)
 
+    anomalyModelFeatures = .standardizeColnames(anomalyModelFeatures)
+
     input = MSstatsConvert::MSstatsImport(list(input = input),
                                           "MSstats", "DIANN")
+
     input = MSstatsConvert::MSstatsClean(input, MBR = MBR, 
                                          quantificationColumn = quantificationColumn,
                                          global_qvalue_cutoff = global_qvalue_cutoff,
                                          qvalue_cutoff = qvalue_cutoff, 
-                                         pg_qvalue_cutoff = pg_qvalue_cutoff)
+                                         pg_qvalue_cutoff = pg_qvalue_cutoff,
+                                         calculateAnomalyScores = calculateAnomalyScores, 
+                                         anomalyModelFeatures = anomalyModelFeatures)
     annotation = MSstatsConvert::MSstatsMakeAnnotation(input, annotation)
 
     decoy_filter = list(col_name = "ProteinName",
@@ -87,6 +106,7 @@ DIANNtoMSstatsFormat = function(input, annotation = NULL,
 
     feature_columns = c("PeptideSequence", "PrecursorCharge",
                         "FragmentIon", "ProductCharge")
+    # browser()
     input = MSstatsConvert::MSstatsPreprocess(
         input, 
         annotation, 
@@ -101,19 +121,28 @@ DIANNtoMSstatsFormat = function(input, annotation = NULL,
             remove_features_with_few_measurements = removeFewMeasurements,
             summarize_multiple_psms = max),
         columns_to_fill = list(Fraction = 1,
-                               IsotopeLabelType = "Light"))
+                               IsotopeLabelType = "Light"),
+        anomaly_metrics = anomalyModelFeatures)
     input[, Intensity := ifelse(Intensity == 0, NA, Intensity)]
-
+    # browser()
     input = MSstatsConvert::MSstatsBalancedDesign(input, feature_columns, 
-                                                  fill_incomplete = FALSE,
+                                                  fill_incomplete = TRUE,
                                                   handle_fractions = FALSE,
-                                                  remove_few = removeFewMeasurements
+                                                  remove_few = removeFewMeasurements,
+                                                  anomaly_metrics = anomalyModelFeatures
     )
-
+    # browser()
+    if (calculateAnomalyScores){
+        input = MSstatsConvert::MSstatsAnomalyScores(
+            input, anomalyModelFeatures, anomalyModelFeatureTemporal,
+            removeMissingFeatures, anomalyModelFeatureCount, runOrder, n_trees, 
+            max_depth, numberOfCores)
+    }
+    # browser()
     msg_final = paste("** Finished preprocessing. The dataset is ready",
                       "to be processed by the dataProcess function.")
     getOption("MSstatsLog")("INFO", msg_final)
     getOption("MSstatsMsg")("INFO", msg_final)
     getOption("MSstatsLog")("INFO", "\n")
     input
-}
+}
-Original file line number
+Diff line change
@@ Expand Up / @@ -8,4 +8,6 @@ inst/doc @@
     *.log
     *.o
     *.so
-    *.dll
+    *.dll
+    .lintr
+    .vscode