UMCUGenetics · mraves2 · Oct 13, 2025 · Feb 13, 2026 · Feb 13, 2026 · Feb 13, 2026
diff --git a/DIMS/CollectFilled.R b/DIMS/CollectFilled.R
@@ -1,14 +1,11 @@
-## adapted from 10-collectSamplesFilled.R
-
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
-scripts_dir <- cmd_args[1]
+preprocessing_scripts_dir <- cmd_args[1]
 ppm <- as.numeric(cmd_args[2])
 z_score <- as.numeric(cmd_args[3])
 
-source(paste0(scripts_dir, "merge_duplicate_rows.R"))
-source(paste0(scripts_dir, "calculate_zscores.R"))
+source(paste0(preprocessing_scripts_dir, "collect_filled_functions.R"))
 
 # for each scan mode, collect all filled peak group lists
 scanmodes <- c("positive", "negative")
@@ -17,7 +14,7 @@ for (scanmode in scanmodes) {
   filled_files <- list.files("./", full.names = TRUE, pattern = paste0(scanmode, "_identified_filled"))
   # load files and combine into one object
   outlist_total <- NULL
-  for (file_nr in 1:length(filled_files)) {
+  for (file_nr in seq_along(filled_files)) {
     peakgrouplist_filled <- get(load(filled_files[file_nr]))
     outlist_total <- rbind(outlist_total, peakgrouplist_filled)
   }
@@ -30,37 +27,17 @@ for (scanmode in scanmodes) {
   repl_pattern <- get(load(pattern_file))
   # calculate Z-scores
   if (z_score == 1) {
-    outlist_stats <- calculate_zscores(outlist_total, adducts = FALSE)
-    nr_removed_samples <- length(which(repl_pattern[] == "character(0)"))
-    order_index_int <- order(colnames(outlist_stats)[8:(length(repl_pattern) - nr_removed_samples + 7)])
-    outlist_stats_more <- cbind(
-      outlist_stats[, 1:7],
-      outlist_stats[, (length(repl_pattern) - nr_removed_samples + 8):(length(repl_pattern) - nr_removed_samples + 8 + 6)],
-      outlist_stats[, 8:(length(repl_pattern) - nr_removed_samples + 7)][order_index_int],
-      outlist_stats[, (length(repl_pattern) - nr_removed_samples + 5 + 10):ncol(outlist_stats)]
-    )
-    # sort Z-score columns and append to peak group list
-    tmp_index <- grep("_Zscore", colnames(outlist_stats_more), fixed = TRUE)
-    tmp_index_order <- order(colnames(outlist_stats_more[, tmp_index]))
-    tmp <- outlist_stats_more[, tmp_index[tmp_index_order]]
-    outlist_stats_more <- outlist_stats_more[, -tmp_index]
-    outlist_stats_more <- cbind(outlist_stats_more, tmp)
-    outlist_total <- outlist_stats_more
+    outlist_stats <- calculate_zscores_peakgrouplist(outlist_total)
   }
-
-  # make a copy of the outlist
-  outlist_ident <- outlist_total
-  # take care of NAs in theormz_noise
-  outlist_ident$theormz_noise[which(is.na(outlist_ident$theormz_noise))] <- 0
-  outlist_ident$theormz_noise <- as.numeric(outlist_ident$theormz_noise)
-  outlist_ident$theormz_noise[which(is.na(outlist_ident$theormz_noise))] <- 0
-  outlist_ident$theormz_noise <- as.numeric(outlist_ident$theormz_noise)
+  # calculate ppm deviation
+  outlist_withppm <- calculate_ppmdeviation(outlist_stats)
+  #  put columns in correct order
+  outlist_ident <- order_columns_peakgrouplist(outlist_withppm)
 
-  # Extra output in Excel-readable format:
-  remove_columns <- c("fq.best", "fq.worst", "mzmin.pgrp", "mzmax.pgrp")
-  remove_colindex <- which(colnames(outlist_ident) %in% remove_columns)
-  outlist_ident <- outlist_ident[, -remove_colindex]
+  # generate output in Excel-readable format:
+  remove_columns <- c("mzmin.pgrp", "mzmax.pgrp")
+  outlist_ident <- outlist_ident[, -which(colnames(outlist_ident) %in% remove_columns)]
   write.table(outlist_ident, file = paste0("outlist_identified_", scanmode, ".txt"), sep = "\t", row.names = FALSE)
-  # output in RData format
+  # generate output in RData format
   save(outlist_ident, file = paste0("outlist_identified_", scanmode, ".RData"))
 }
diff --git a/DIMS/preprocessing/collect_filled_functions.R b/DIMS/preprocessing/collect_filled_functions.R
@@ -0,0 +1,145 @@
+# CollectFilled functions
+
+collapse <- function(column_label, peakgroup_list, index_dup) {
+  #' Collapse identification info for peak groups with the same mass
+  #'
+  #' @param column_label: Name of column in peakgroup_list (string)
+  #' @param peakgroup_list: Peak group list (matrix)
+  #' @param index_dup: Index of duplicate peak group (integer)
+  #'
+  #' @return collapsed_items: Semicolon-separated list of info (string)
+  # get the item(s) that need to be collapsed
+  list_items <- as.vector(peakgroup_list[index_dup, column_label])
+  # remove NA
+  if (length(which(is.na(list_items))) > 0) {
+    list_items <- list_items[-which(is.na(list_items))]
+  }
+  collapsed_items <- paste(list_items, collapse = ";")
+  return(collapsed_items)
+}
+
+merge_duplicate_rows <- function(peakgroup_list) {
+  #' Merge identification info for peak groups with the same mass
+  #'
+  #' @param peakgroup_list: Peak group list (matrix)
+  #'
+  #' @return peakgroup_list_dedup: de-duplicated peak group list (matrix)
+
+  options(digits = 16)
+  collect <- NULL
+  remove <- NULL
+
+  # check for peak groups with identical mass
+  index_dup <- which(duplicated(peakgroup_list[, "mzmed.pgrp"]))
+
+  while (length(index_dup) > 0) {
+    # get the index for the peak group which is double
+    peaklist_index <- which(peakgroup_list[, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"])
+    single_peakgroup <- peakgroup_list[peaklist_index[1], , drop = FALSE]
+
+    # use function collapse to concatenate info
+    single_peakgroup[, "assi_HMDB"] <- collapse("assi_HMDB", peakgroup_list, peaklist_index)
+    single_peakgroup[, "iso_HMDB"] <- collapse("iso_HMDB", peakgroup_list, peaklist_index)
+    single_peakgroup[, "HMDB_code"] <- collapse("HMDB_code", peakgroup_list, peaklist_index)
+    single_peakgroup[, "all_hmdb_ids"] <- collapse("all_hmdb_ids", peakgroup_list, peaklist_index)
+    single_peakgroup[, "sec_hmdb_ids"] <- collapse("sec_hmdb_ids", peakgroup_list, peaklist_index)
+    if (single_peakgroup[, "sec_hmdb_ids"] == ";") single_peakgroup[, "sec_hmdb_ids"] < NA
+
+    # keep track of deduplicated entries
+    collect <- rbind(collect, single_peakgroup)
+    remove <- c(remove, peaklist_index)
+
+    # remove current entry from index
+    index_dup <- index_dup[-which(peakgroup_list[index_dup, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"])]
+  }
+
+  # remove duplicate entries
+  if (!is.null(remove)) {
+    peakgroup_list <- peakgroup_list[-remove, ]
+  }
+  # append deduplicated entries
+  peakgroup_list_dedup <- rbind(peakgroup_list, collect)
+  return(peakgroup_list_dedup)
+}
+
+calculate_zscores_peakgrouplist <- function(peakgroup_list) {
+  #' Calculate Z-scores for peak groups based on average and standard deviation of controls
+  #'
+  #' @param peakgroup_list: Peak group list (matrix)
+  #'
+  #' @return peakgroup_list_dedup: de-duplicated peak group list (matrix)
+
+  case_label <- "P"
+  control_label <- "C"
+  # get index for new column names
+  startcol <- ncol(peakgroup_list) + 3
+
+  # calculate mean and standard deviation for Control group
+  ctrl_cols <- grep(control_label, colnames(peakgroup_list), fixed = TRUE)
+  case_cols <- grep(case_label, colnames(peakgroup_list), fixed = TRUE)
+  int_cols <- c(ctrl_cols, case_cols)
+  # set all zeros to NA
+  peakgroup_list[, int_cols][peakgroup_list[, int_cols] == 0] <- NA
+  ctrl_ints <- peakgroup_list[, ctrl_cols, drop = FALSE]
+  peakgroup_list$avg.ctrls <- apply(ctrl_ints, 1, function(x) mean(as.numeric(x), na.rm = TRUE))
+  peakgroup_list$sd.ctrls <- apply(ctrl_ints, 1, function(x) sd(as.numeric(x), na.rm = TRUE))
+
+  # set new column names and calculate Z-scores
+  colnames_zscores <- NULL
+  for (col_index in int_cols) {
+    col_name <- colnames(peakgroup_list)[col_index]
+    colnames_zscores <- c(colnames_zscores, paste0(col_name, "_Zscore"))
+    zscores_1col <- (as.numeric(as.vector(unlist(peakgroup_list[, col_index]))) -
+                       peakgroup_list$avg.ctrls) / peakgroup_list$sd.ctrls
+    peakgroup_list <- cbind(peakgroup_list, zscores_1col)
+  }
+
+  # apply new column names to columns at end plus avg and sd columns
+  colnames(peakgroup_list)[startcol:ncol(peakgroup_list)] <- colnames_zscores
+
+  return(peakgroup_list)
+}
+
+calculate_ppm_deviation <- function(peakgroup_list) {
+  #' Calculate ppm deviation between observed mass and expected theoretical mass
+  #'
+  #' @param peakgroup_list: Peak group list (matrix)
+  #'
+  #' @return peakgroup_list_ppm: peak group list with ppm column (matrix)
+
+  # calculate ppm deviation
+  for (row_index in seq_len(nrow(peakgroup_list))) {
+    if (!is.na(peakgroup_list$theormz_HMDB[row_index]) &&
+        !is.null(peakgroup_list$theormz_HMDB[row_index]) &&
+        (peakgroup_list$theormz_HMDB[row_index] != "")) {
+      peakgroup_list$ppmdev[row_index] <- 10^6 * (as.numeric(as.vector(peakgroup_list$mzmed.pgrp[row_index])) -
+                                                  as.numeric(as.vector(peakgroup_list$theormz_HMDB[row_index]))) /
+                                                  as.numeric(as.vector(peakgroup_list$theormz_HMDB[row_index]))
+    } else {
+      peakgroup_list$ppmdev[row_index] <- NA
+    }
+  }
+
+  return(peakgroup_list)
+}
+
+order_columns_peakgrouplist <- function(peakgroup_list) {
+  #' Put columns in peak group list in correct order
+  #'
+  #' @param peakgroup_list: Peak group list (matrix)
+  #'
+  #' @return peakgroup_ordered: peak group list with columns in correct order (matrix)
+
+  original_colnames <- colnames(peakgroup_list)
+  mass_columns <- c(grep("mzm", original_colnames), grep("nrsamples", original_colnames))
+  descriptive_columns <- c(grep("assi_HMDB", original_colnames):grep("avg.int", original_colnames), grep("ppmdev", original_colnames))
+  intensity_columns <- c((grep("nrsamples", original_colnames) + 1):(grep("assi_HMDB", original_colnames) - 1))
+  # if no Z-scores have been calculated, the following two variables will be empty without consequences for outlist_total
+  control_columns <- grep ("ctrls", original_colnames)
+  zscore_columns <- grep("_Zscore", original_colnames)
+  # create peak group list with columns in correct order
+  peakgroup_ordered <- peakgroup_list[ , c(mass_columns, descriptive_columns, intensity_columns, control_columns, zscore_columns)]
+
+  return(peakgroup_ordered)
+}
+
diff --git a/DIMS/tests/testthat/fixtures/test_peakgroup_list.txt b/DIMS/tests/testthat/fixtures/test_peakgroup_list.txt
@@ -0,0 +1,5 @@
+"mzmed.pgrp"	"nrsamples"	"C101.1"	"C102.1"	"P2.1"	"P3.1"	"assi_HMDB"	"all_hmdb_names"	"iso_HMDB"	"HMDB_code"	"all_hmdb_ids"	"sec_hmdb_ids"	"theormz_HMDB"	"avg.int"	"avg.ctrls"	"sd.ctrls"	"C101.1_Zscore"	"C102.1_Zscore"	"P2.1_Zscore"	"P3.1_Zscore"	"ppmdev"
+"1"	300.199680958642	0.451108327135444	1000	5000	10000	50000	"A"	"A;X"	NA	"HMDB1234567"	"HMDB1234567;HMDB1234567"	NA	300.1996476	16500	3000	2828.42712474619	9000	13000	90000	130000	0.111112214857712
+"2"	300.000315890415	0.498603057814762	2000	6000	20000	60000	"B"	"B;Y"	NA	"HMDB1234567_1"	"HMDB1234567_1;HMDB1234567_1"	NA	300.00017417	22000	4000	2828.42712474619	10000	14000	1e+05	140000	0.473299680976197
+"3"	300.254185894039	0.589562055887654	3000	7000	30000	70000	"C"	"C;Z"	NA	"HMDB1234567_2"	"HMDB1234567_2;HMDB1234567_2"	NA	300.25413357	27500	5000	2828.42712474619	11000	15000	110000	150000	0.17426158930175
+"4"	300.755745105678	0.277923040557653	4000	8000	40000	80000	"D"	"D;V"	NA	"HMDB1234567_7"	"HMDB1234567_7;HMDB1234567_7"	NA	300.75568892	33000	6000	2828.42712474619	12000	16000	120000	160000	0.186787674436346
diff --git a/DIMS/tests/testthat/test_collect_filled.R b/DIMS/tests/testthat/test_collect_filled.R
@@ -0,0 +1,77 @@
+# unit tests for CollectFilled
+# functions: collapse, merge_duplicate_rows, calculate_zscores_peakgrouplist,
+#            calculate_ppm_deviation, order_columns_peakgrouplist
+source("../../preprocessing/collect_filled_functions.R")
+
+testthat::test_that("Values for duplicate rows are correctly collapsed", {
+  test_matrix <- matrix(letters[1:8], nrow = 2, ncol = 4)
+  colnames(test_matrix) <- paste0("column", 1:4)
+
+  expect_equal(collapse("column1", test_matrix, c(1,2)), "a;b", TRUE)
+})
+
+testthat::test_that("Duplicate rows in a peak group list are correctly merged", {
+  # Copy/symlink the files to the current location for the function
+  test_files <- list.files("fixtures/", "test_peakgroup_list", full.names = TRUE)
+  file.symlink(file.path(test_files), getwd())
+
+  # read in peakgroup_list, create duplicate row 
+  test_peakgroup_list <- read.table("./test_peakgroup_list.txt", sep = "\t")
+  test_peakgroup_list_dup <- test_peakgroup_list[c(1, 2, 2, 3), ]
+
+  # after merging duplicate rows, the test peak group list should have 3 rows
+  expect_equal(nrow(merge_duplicate_rows(test_peakgroup_list_dup)), 3, TRUE, tolerance = 0.001)
+  expect_equal(merge_duplicate_rows(test_peakgroup_list_dup)[3, "all_hmdb_ids"], 
+               paste(test_peakgroup_list_dup[2, "all_hmdb_ids"], test_peakgroup_list_dup[3, "all_hmdb_ids"], sep = ";"), 
+               TRUE)
+})
+
+testthat::test_that("Z-scores are correctly calculated in CollectFilled", {
+  # read in peakgroup_list; remove avg.int, std.int, noise and Zscore columns
+  test_peakgroup_list <- read.table("./test_peakgroup_list.txt", sep = "\t")
+  # remove avg.ctrls, sd.ctrls and Z-score columns
+  test_peakgroup_list_noz <- test_peakgroup_list[ , -grep("avg.ctrls", colnames(test_peakgroup_list))]
+  test_peakgroup_list_noz <- test_peakgroup_list_noz[ , -grep("sd.ctrls", colnames(test_peakgroup_list_noz))]
+  test_peakgroup_list_noz <- test_peakgroup_list_noz[ , -grep("_Zscore", colnames(test_peakgroup_list_noz))]
+
+  # after calculate_zscores_peakgrouplist, there should be 4 columns with _Zscore in the name
+  expect_equal(length(grep("_Zscore", colnames(calculate_zscores_peakgrouplist(test_peakgroup_list_noz)))), 4, TRUE, tolerance = 0.001)
+
+  # after calculate_zscores_peakgrouplist, the 4 columns with _Zscore in the name should be filled non-zero
+  expect_equal(calculate_zscores_peakgrouplist(test_peakgroup_list_noz)$C101.1_Zscore[1], -0.7071, TRUE, tolerance = 0.00001)
+  expect_equal(calculate_zscores_peakgrouplist(test_peakgroup_list_noz)$P2.1_Zscore[4], 12.0208, TRUE, tolerance = 0.00001)
+
+})
+
+testthat::test_that("ppm deviation values are correctly calculated in CollectFilled", {
+  # read in peakgroup_list
+  test_peakgroup_list <- read.table("./test_peakgroup_list.txt", sep = "\t")
+
+  # store ppm deviation values
+  test_ppm_values <- test_peakgroup_list$ppmdev
+
+  # after calculate_ppm_deviation, there ppm values in the new column should approximate the old ones
+  expect_equal(calculate_ppm_deviation(test_peakgroup_list)$ppmdev, test_ppm_values, TRUE, tolerance = 0.001)
+
+  # calculate_ppm_deviation should give NA if there is no value for theormz_HMDB
+  test_peakgroup_list$theormz_HMDB[1] <- NA
+  expect_identical(is.na(calculate_ppm_deviation(test_peakgroup_list)$ppmdev[1]), TRUE)
+
+})
+
+testthat::test_that("columns in peak group list are corretly sorted", {
+  # read in peakgroup_list
+  test_peakgroup_list <- read.table("./test_peakgroup_list.txt", sep = "\t")
+  # original order of columns
+  original_column_order <- colnames(test_peakgroup_list)
+  # after ordering, column names should be re-ordered
+  test_column_order <- original_column_order[c(1, 2, 7:14, 21, 3:6, 15:20)]
+
+  expect_identical(colnames(order_columns_peakgrouplist(test_peakgroup_list)), test_column_order)
+
+  # Remove copied/symlinked files
+  files_remove <- list.files("./", "test_peakgroup_list.txt", full.names = TRUE)
+  file.remove(files_remove)
+
+})
+