Skip to content
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,6 @@ inst/doc
*.log
*.o
*.so
*.dll
*.dll
.lintr
.vscode
12 changes: 9 additions & 3 deletions R/MSstatsConvert_core_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -518,6 +518,12 @@ MSstatsMakeAnnotation = function(input, annotation, ...) {
}

#' Run Anomaly Model
#'
#' Detects anomalous measurements in mass spectrometry data using an isolation forest algorithm.
#' This function identifies unusual precursor measurements based on quality metrics and their
#' temporal patterns. For features with insufficient quality metric data, it assigns anomaly
#' scores based on the median score of similar features (same peptide and charge combination).
#' The model supports parallel processing for improved performance on large datasets.
Comment on lines 520 to +526
Copy link

Copilot AI Mar 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new MSstatsAnomalyScores roxygen description claims median-based fallback for insufficient quality metrics and parallel-processing support, but neither behavior is visible in the current implementation of MSstatsAnomalyScores / .prepareSpectronautAnomalyInput / .runAnomalyModel. Please either implement these behaviors or adjust the documentation to match what the function actually does.

Copilot uses AI. Check for mistakes.
#'
#' @param input data.table preprocessed by the MSstatsBalancedDesign function
#' @param quality_metrics character vector of quality metrics to use in the model
Expand All @@ -535,7 +541,7 @@ MSstatsMakeAnnotation = function(input, annotation, ...) {
MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction,
missing_run_count, n_feat, run_order, n_trees,
max_depth, cores){

input = .prepareSpectronautAnomalyInput(input, quality_metrics,
run_order, n_feat,
missing_run_count)
Expand All @@ -548,14 +554,14 @@ MSstatsAnomalyScores = function(input, quality_metrics, temporal_direction,
temporal_direction[i]))
}
}

input = .runAnomalyModel(input,
n_trees=n_trees,
max_depth=max_depth,
cores=cores,
split_column="PSM",
quality_metrics=quality_metrics)

subset_cols = c("Run", "ProteinName", "PeptideSequence",
"PrecursorCharge", "FragmentIon",
"ProductCharge", "IsotopeLabelType",
Expand Down
30 changes: 21 additions & 9 deletions R/clean_DIANN.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
quantificationColumn = "FragmentQuantCorrected",
global_qvalue_cutoff = 0.01,
qvalue_cutoff = 0.01,
pg_qvalue_cutoff = 0.01) {
pg_qvalue_cutoff = 0.01,
calculateAnomalyScores = FALSE,
anomalyModelFeatures = c()) {
dn_input <- getInputFile(msstats_object, "input")
dn_input <- data.table::as.data.table(dn_input)

Expand All @@ -19,7 +21,9 @@
dn_input <- .cleanDIANNAddMissingColumns(dn_input)

# Select required columns
dn_input <- .cleanDIANNSelectRequiredColumns(dn_input, quantificationColumn, MBR)
dn_input <- .cleanDIANNSelectRequiredColumns(dn_input, quantificationColumn, MBR,
calculateAnomalyScores,
anomalyModelFeatures)

# Split concatenated values
dn_input <- .cleanDIANNSplitConcatenatedValues(dn_input, quantificationColumn)
Expand Down Expand Up @@ -78,7 +82,9 @@
#' @param MBR logical indicating if match between runs was used
#' @return data.table with selected columns
#' @noRd
.cleanDIANNSelectRequiredColumns <- function(dn_input, quantificationColumn, MBR) {
.cleanDIANNSelectRequiredColumns <- function(dn_input, quantificationColumn, MBR,
calculateAnomalyScores,
anomalyModelFeatures) {
base_cols <- c('ProteinNames', 'StrippedSequence', 'ModifiedSequence',
'PrecursorCharge', quantificationColumn, 'QValue',
'PrecursorMz', 'FragmentInfo', 'Run')
Expand All @@ -89,7 +95,13 @@
c('GlobalQValue', 'GlobalPGQValue')
}

req_cols <- c(base_cols, mbr_cols)
qual_cols <- if (calculateAnomalyScores) {
anomalyModelFeatures
} else {
c()
}

req_cols <- c(base_cols, mbr_cols, qual_cols)
return(dn_input[, req_cols, with = FALSE])
}

Expand Down Expand Up @@ -163,13 +175,13 @@
getOption("MSstatsLog")("INFO", msg)
getOption("MSstatsMsg")("INFO", msg)

dn_input = dn_input[QValue < global_qvalue_cutoff, ]
dn_input = dn_input[QValue >= global_qvalue_cutoff, quantificationColumn := 0]
if (MBR) {
msg = '** MBR was used to analyze the data. Now setting names and filtering'
msg_1_mbr = paste0('-- LibPGQValue < ', pg_qvalue_cutoff)
msg_2_mbr = paste0('-- LibQValue < ', qvalue_cutoff)
dn_input = dn_input[LibPGQValue < pg_qvalue_cutoff, ]
dn_input = dn_input[LibQValue < qvalue_cutoff, ]
dn_input = dn_input[LibPGQValue >= pg_qvalue_cutoff, , quantificationColumn := 0]
dn_input = dn_input[LibQValue >= qvalue_cutoff, , quantificationColumn := 0]
getOption("MSstatsLog")("INFO", msg)
getOption("MSstatsMsg")("INFO", msg)
getOption("MSstatsLog")("INFO", msg_1_mbr)
Expand All @@ -181,8 +193,8 @@
msg = '** MBR was not used to analyze the data. Now setting names and filtering'
msg_1 = paste0('-- Filtering on GlobalPGQValue < ', pg_qvalue_cutoff)
msg_2 = paste0('-- Filtering on GlobalQValue < ', qvalue_cutoff)
dn_input = dn_input[GlobalPGQValue < pg_qvalue_cutoff, ]
dn_input = dn_input[GlobalQValue < qvalue_cutoff, ]
dn_input = dn_input[GlobalPGQValue >= pg_qvalue_cutoff, quantificationColumn := 0]
dn_input = dn_input[GlobalQValue >= qvalue_cutoff, quantificationColumn := 0]
getOption("MSstatsLog")("INFO", msg)
getOption("MSstatsMsg")("INFO", msg)
getOption("MSstatsLog")("INFO", msg_1)
Expand Down
69 changes: 49 additions & 20 deletions R/converters_DIANNtoMSstatsFormat.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,15 @@
#' @param quantificationColumn Use 'FragmentQuantCorrected'(default) column for quantified intensities for DIANN 1.8.x.
#' Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x.
#' Use 'auto' for quantified intensities for DIANN 2.x where each fragment intensity is a separate column, e.g. Fr0Quantity.
#' @param calculateAnomalyScores Default is FALSE. If TRUE, will run anomaly detection model and calculate anomaly scores for each feature. Used downstream to weigh measurements in differential analysis.
#' @param anomalyModelFeatures character vector of quality metric column names to be used as features in the anomaly detection model. List must not be empty if calculateAnomalyScores=TRUE.
#' @param anomalyModelFeatureTemporal character vector of temporal direction corresponding to columns passed to anomalyModelFeatures. Values must be one of: `mean_decrease`, `mean_increase`, `dispersion_increase`, or NULL (to perform no temporal feature engineering). Default is empty vector. If calculateAnomalyScores=TRUE, vector must have as many values as anomalyModelFeatures (even if all NULL).
Copy link

Copilot AI Mar 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The anomalyModelFeatureTemporal docs say values can be NULL, but NULL cannot appear as an element in a character vector (it gets dropped). Consider documenting a character sentinel (e.g., "none") or NA_character_ instead, and validate accordingly.

Suggested change
#' @param anomalyModelFeatureTemporal character vector of temporal direction corresponding to columns passed to anomalyModelFeatures. Values must be one of: `mean_decrease`, `mean_increase`, `dispersion_increase`, or NULL (to perform no temporal feature engineering). Default is empty vector. If calculateAnomalyScores=TRUE, vector must have as many values as anomalyModelFeatures (even if all NULL).
#' @param anomalyModelFeatureTemporal character vector of temporal direction corresponding to columns passed to anomalyModelFeatures. Values must be one of: `mean_decrease`, `mean_increase`, `dispersion_increase`, or a sentinel indicating no temporal feature engineering (e.g. `none` or `NA_character_`). Default is empty vector. If calculateAnomalyScores=TRUE, vector must have as many values as anomalyModelFeatures (even if all indicate no temporal feature engineering).

Copilot uses AI. Check for mistakes.
#' @param removeMissingFeatures Remove features with missing values in more than this fraction of runs. Default is 0.5. Only used if calculateAnomalyScores=TRUE.
#' @param anomalyModelFeatureCount Feature selection for anomaly model. Anomaly detection works on the precursor-level and can be much slower if all features used. We will by default filter to the top-100 highest intensity features. This can be adjusted as necessary. To turn feature-selection off, set this value to a high number (e.g. 10000). Only used if calculateAnomalyScores=TRUE.
#' @param runOrder Temporal order of MS runs. Should be a two column data.table with columns `Run` and `Order`, where `Run` matches the run name output by DIA-NN and `Order` is an integer. Used to engineer the temporal features defined in anomalyModelFeatureTemporal.
#' @param n_trees Number of trees to use in isolation forest when calculateAnomalyScores=TRUE. Default is 100.
#' @param max_depth Max tree depth to use in isolation forest when calculateAnomalyScores=TRUE. Default is "auto" which calculates depth as log2(N) where N is the number of runs. Otherwise must be an integer.
#' @param numberOfCores Number of cores for parallel processing anomaly detection model. When > 1, a logfile named 'MSstats_anomaly_model_progress.log' is created to track progress. Only works for Linux & Mac OS. Default is 1.
#' @param ... additional parameters to `data.table::fread`.
#'
#' @return data.frame in the MSstats required format.
Expand Down Expand Up @@ -51,29 +60,39 @@
#' output_2_0 = DIANNtoMSstatsFormat(input_2_0, annotation = annot_2_0, MBR = FALSE,
#' use_log_file = FALSE, quantificationColumn = 'auto')
#' head(output_2_0)
DIANNtoMSstatsFormat = function(input, annotation = NULL,
global_qvalue_cutoff = 0.01,
qvalue_cutoff = 0.01,
pg_qvalue_cutoff = 0.01,
useUniquePeptide = TRUE,
removeFewMeasurements = TRUE,
removeOxidationMpeptides = TRUE,
removeProtein_with1Feature = TRUE,
use_log_file = TRUE, append = FALSE,
verbose = TRUE, log_file_path = NULL,
MBR = TRUE,
quantificationColumn = "FragmentQuantCorrected",
...) {
DIANNtoMSstatsFormat = function(
input, annotation = NULL,
global_qvalue_cutoff = 0.01,
qvalue_cutoff = 0.01,
pg_qvalue_cutoff = 0.01,
useUniquePeptide = TRUE,
removeFewMeasurements = TRUE,
removeOxidationMpeptides = TRUE,
removeProtein_with1Feature = TRUE,
MBR = TRUE,
quantificationColumn = "FragmentQuantCorrected",
calculateAnomalyScores=FALSE, anomalyModelFeatures=c(),
anomalyModelFeatureTemporal=c(), removeMissingFeatures=.5,
anomalyModelFeatureCount=100,
runOrder=NULL, n_trees=100, max_depth="auto", numberOfCores=1,
Comment on lines +74 to +77
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Add upfront validation for anomaly-model arguments.

When calculateAnomalyScores=TRUE, there is no early check that anomalyModelFeatures is non-empty and aligned with anomalyModelFeatureTemporal. This can fail late in Line 168+ with less actionable errors.

Proposed fix
     anomalyModelFeatures = .standardizeColnames(anomalyModelFeatures)
+    if (calculateAnomalyScores) {
+        if (length(anomalyModelFeatures) == 0) {
+            stop("anomalyModelFeatures must be non-empty when calculateAnomalyScores=TRUE.")
+        }
+
+        allowed_temporal = c("mean_decrease", "mean_increase", "dispersion_increase", NA_character_)
+        if (length(anomalyModelFeatureTemporal) > 0) {
+            if (length(anomalyModelFeatureTemporal) != length(anomalyModelFeatures)) {
+                stop("anomalyModelFeatureTemporal must have the same length as anomalyModelFeatures.")
+            }
+            if (!all(anomalyModelFeatureTemporal %in% allowed_temporal)) {
+                stop("anomalyModelFeatureTemporal values must be one of mean_decrease, mean_increase, dispersion_increase, or NA/NULL.")
+            }
+        }
+    }

Also applies to: 85-86, 167-171

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@R/converters_DIANNtoMSstatsFormat.R` around lines 75 - 78, When
calculateAnomalyScores is TRUE, add upfront validation that anomalyModelFeatures
is non-empty and that length(anomalyModelFeatures) matches
length(anomalyModelFeatureTemporal) (or that anomalyModelFeatureTemporal is
length 1 or same length) before any downstream processing; in practice, inside
the top-level function that accepts args (check the parameter block containing
calculateAnomalyScores, anomalyModelFeatures, anomalyModelFeatureTemporal, e.g.,
the converter function in R/converters_DIANNtoMSstatsFormat.R) add checks that
throw informative errors (stop(...)) if anomalyModelFeatures is NULL/length 0 or
if the two feature vectors are misaligned, and run these validations immediately
after parsing arguments (before any use around lines ~167-171 and ~168) so
failures are early and actionable.

use_log_file = TRUE, append = FALSE,
verbose = TRUE, log_file_path = NULL,
...) {
Comment on lines +63 to +80
Copy link

Copilot AI Mar 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This converter now exposes anomaly-model parameters (calculateAnomalyScores, anomalyModelFeatures, temporal settings, etc.) but doesn’t validate them (unlike SpectronauttoMSstatsFormat, which calls .validateMSstatsConverterParameters). Please add the same validation here to catch cases like calculateAnomalyScores=TRUE with empty/length-mismatched feature vectors early with a clear error.

Copilot uses AI. Check for mistakes.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@copilot open a new pull request to apply changes based on this feedback

MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose,
log_file_path)

anomalyModelFeatures = .standardizeColnames(anomalyModelFeatures)

input = MSstatsConvert::MSstatsImport(list(input = input),
"MSstats", "DIANN")

input = MSstatsConvert::MSstatsClean(input, MBR = MBR,
quantificationColumn = quantificationColumn,
global_qvalue_cutoff = global_qvalue_cutoff,
qvalue_cutoff = qvalue_cutoff,
pg_qvalue_cutoff = pg_qvalue_cutoff)
pg_qvalue_cutoff = pg_qvalue_cutoff,
calculateAnomalyScores = calculateAnomalyScores,
anomalyModelFeatures = anomalyModelFeatures)
annotation = MSstatsConvert::MSstatsMakeAnnotation(input, annotation)

decoy_filter = list(col_name = "ProteinName",
Expand All @@ -87,6 +106,7 @@ DIANNtoMSstatsFormat = function(input, annotation = NULL,

feature_columns = c("PeptideSequence", "PrecursorCharge",
"FragmentIon", "ProductCharge")
# browser()
input = MSstatsConvert::MSstatsPreprocess(
input,
annotation,
Expand All @@ -101,19 +121,28 @@ DIANNtoMSstatsFormat = function(input, annotation = NULL,
remove_features_with_few_measurements = removeFewMeasurements,
summarize_multiple_psms = max),
columns_to_fill = list(Fraction = 1,
IsotopeLabelType = "Light"))
IsotopeLabelType = "Light"),
anomaly_metrics = anomalyModelFeatures)
input[, Intensity := ifelse(Intensity == 0, NA, Intensity)]

# browser()
input = MSstatsConvert::MSstatsBalancedDesign(input, feature_columns,
fill_incomplete = FALSE,
fill_incomplete = TRUE,
handle_fractions = FALSE,
remove_few = removeFewMeasurements
remove_few = removeFewMeasurements,
anomaly_metrics = anomalyModelFeatures
)

# browser()
if (calculateAnomalyScores){
input = MSstatsConvert::MSstatsAnomalyScores(
input, anomalyModelFeatures, anomalyModelFeatureTemporal,
removeMissingFeatures, anomalyModelFeatureCount, runOrder, n_trees,
max_depth, numberOfCores)
}
# browser()
msg_final = paste("** Finished preprocessing. The dataset is ready",
"to be processed by the dataProcess function.")
getOption("MSstatsLog")("INFO", msg_final)
getOption("MSstatsMsg")("INFO", msg_final)
getOption("MSstatsLog")("INFO", "\n")
input
}
}
Loading
Loading