Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 12 additions & 35 deletions DIMS/CollectFilled.R
100755 → 100644
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
## adapted from 10-collectSamplesFilled.R

# define parameters
cmd_args <- commandArgs(trailingOnly = TRUE)

scripts_dir <- cmd_args[1]
preprocessing_scripts_dir <- cmd_args[1]
ppm <- as.numeric(cmd_args[2])
z_score <- as.numeric(cmd_args[3])

source(paste0(scripts_dir, "merge_duplicate_rows.R"))
source(paste0(scripts_dir, "calculate_zscores.R"))
source(paste0(preprocessing_scripts_dir, "collect_filled_functions.R"))

# for each scan mode, collect all filled peak group lists
scanmodes <- c("positive", "negative")
Expand All @@ -17,7 +14,7 @@ for (scanmode in scanmodes) {
filled_files <- list.files("./", full.names = TRUE, pattern = paste0(scanmode, "_identified_filled"))
# load files and combine into one object
outlist_total <- NULL
for (file_nr in 1:length(filled_files)) {
for (file_nr in seq_along(filled_files)) {
peakgrouplist_filled <- get(load(filled_files[file_nr]))
outlist_total <- rbind(outlist_total, peakgrouplist_filled)
}
Expand All @@ -30,37 +27,17 @@ for (scanmode in scanmodes) {
repl_pattern <- get(load(pattern_file))
# calculate Z-scores
if (z_score == 1) {
outlist_stats <- calculate_zscores(outlist_total, adducts = FALSE)
nr_removed_samples <- length(which(repl_pattern[] == "character(0)"))
order_index_int <- order(colnames(outlist_stats)[8:(length(repl_pattern) - nr_removed_samples + 7)])
outlist_stats_more <- cbind(
outlist_stats[, 1:7],
outlist_stats[, (length(repl_pattern) - nr_removed_samples + 8):(length(repl_pattern) - nr_removed_samples + 8 + 6)],
outlist_stats[, 8:(length(repl_pattern) - nr_removed_samples + 7)][order_index_int],
outlist_stats[, (length(repl_pattern) - nr_removed_samples + 5 + 10):ncol(outlist_stats)]
)
# sort Z-score columns and append to peak group list
tmp_index <- grep("_Zscore", colnames(outlist_stats_more), fixed = TRUE)
tmp_index_order <- order(colnames(outlist_stats_more[, tmp_index]))
tmp <- outlist_stats_more[, tmp_index[tmp_index_order]]
outlist_stats_more <- outlist_stats_more[, -tmp_index]
outlist_stats_more <- cbind(outlist_stats_more, tmp)
outlist_total <- outlist_stats_more
outlist_stats <- calculate_zscores_peakgrouplist(outlist_total)
}

# make a copy of the outlist
outlist_ident <- outlist_total
# take care of NAs in theormz_noise
outlist_ident$theormz_noise[which(is.na(outlist_ident$theormz_noise))] <- 0
outlist_ident$theormz_noise <- as.numeric(outlist_ident$theormz_noise)
outlist_ident$theormz_noise[which(is.na(outlist_ident$theormz_noise))] <- 0
outlist_ident$theormz_noise <- as.numeric(outlist_ident$theormz_noise)
# calculate ppm deviation
outlist_withppm <- calculate_ppmdeviation(outlist_stats)
# put columns in correct order
outlist_ident <- order_columns_peakgrouplist(outlist_withppm)

# Extra output in Excel-readable format:
remove_columns <- c("fq.best", "fq.worst", "mzmin.pgrp", "mzmax.pgrp")
remove_colindex <- which(colnames(outlist_ident) %in% remove_columns)
outlist_ident <- outlist_ident[, -remove_colindex]
# generate output in Excel-readable format:
remove_columns <- c("mzmin.pgrp", "mzmax.pgrp")
outlist_ident <- outlist_ident[, -which(colnames(outlist_ident) %in% remove_columns)]
write.table(outlist_ident, file = paste0("outlist_identified_", scanmode, ".txt"), sep = "\t", row.names = FALSE)
# output in RData format
# generate output in RData format
save(outlist_ident, file = paste0("outlist_identified_", scanmode, ".RData"))
}
145 changes: 145 additions & 0 deletions DIMS/preprocessing/collect_filled_functions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
# CollectFilled functions

collapse <- function(column_label, peakgroup_list, index_dup) {
#' Collapse identification info for peak groups with the same mass
#'
#' @param column_label: Name of column in peakgroup_list (string)
#' @param peakgroup_list: Peak group list (matrix)
#' @param index_dup: Index of duplicate peak group (integer)
#'
#' @return collapsed_items: Semicolon-separated list of info (string)
# get the item(s) that need to be collapsed
list_items <- as.vector(peakgroup_list[index_dup, column_label])
# remove NA
if (length(which(is.na(list_items))) > 0) {
list_items <- list_items[-which(is.na(list_items))]
}
collapsed_items <- paste(list_items, collapse = ";")
return(collapsed_items)
}

merge_duplicate_rows <- function(peakgroup_list) {
#' Merge identification info for peak groups with the same mass
#'
#' @param peakgroup_list: Peak group list (matrix)
#'
#' @return peakgroup_list_dedup: de-duplicated peak group list (matrix)

options(digits = 16)
collect <- NULL
remove <- NULL

# check for peak groups with identical mass
index_dup <- which(duplicated(peakgroup_list[, "mzmed.pgrp"]))

while (length(index_dup) > 0) {
# get the index for the peak group which is double
peaklist_index <- which(peakgroup_list[, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"])
single_peakgroup <- peakgroup_list[peaklist_index[1], , drop = FALSE]

# use function collapse to concatenate info
single_peakgroup[, "assi_HMDB"] <- collapse("assi_HMDB", peakgroup_list, peaklist_index)
single_peakgroup[, "iso_HMDB"] <- collapse("iso_HMDB", peakgroup_list, peaklist_index)
single_peakgroup[, "HMDB_code"] <- collapse("HMDB_code", peakgroup_list, peaklist_index)
single_peakgroup[, "all_hmdb_ids"] <- collapse("all_hmdb_ids", peakgroup_list, peaklist_index)
single_peakgroup[, "sec_hmdb_ids"] <- collapse("sec_hmdb_ids", peakgroup_list, peaklist_index)
if (single_peakgroup[, "sec_hmdb_ids"] == ";") single_peakgroup[, "sec_hmdb_ids"] < NA

# keep track of deduplicated entries
collect <- rbind(collect, single_peakgroup)
remove <- c(remove, peaklist_index)

# remove current entry from index
index_dup <- index_dup[-which(peakgroup_list[index_dup, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"])]
}

# remove duplicate entries
if (!is.null(remove)) {
peakgroup_list <- peakgroup_list[-remove, ]
}
# append deduplicated entries
peakgroup_list_dedup <- rbind(peakgroup_list, collect)
return(peakgroup_list_dedup)
}

calculate_zscores_peakgrouplist <- function(peakgroup_list) {
#' Calculate Z-scores for peak groups based on average and standard deviation of controls
#'
#' @param peakgroup_list: Peak group list (matrix)
#'
#' @return peakgroup_list_dedup: de-duplicated peak group list (matrix)

case_label <- "P"
control_label <- "C"
# get index for new column names
startcol <- ncol(peakgroup_list) + 3

# calculate mean and standard deviation for Control group
ctrl_cols <- grep(control_label, colnames(peakgroup_list), fixed = TRUE)
case_cols <- grep(case_label, colnames(peakgroup_list), fixed = TRUE)
int_cols <- c(ctrl_cols, case_cols)
# set all zeros to NA
peakgroup_list[, int_cols][peakgroup_list[, int_cols] == 0] <- NA
ctrl_ints <- peakgroup_list[, ctrl_cols, drop = FALSE]
peakgroup_list$avg.ctrls <- apply(ctrl_ints, 1, function(x) mean(as.numeric(x), na.rm = TRUE))
peakgroup_list$sd.ctrls <- apply(ctrl_ints, 1, function(x) sd(as.numeric(x), na.rm = TRUE))

# set new column names and calculate Z-scores
colnames_zscores <- NULL
for (col_index in int_cols) {
col_name <- colnames(peakgroup_list)[col_index]
colnames_zscores <- c(colnames_zscores, paste0(col_name, "_Zscore"))
zscores_1col <- (as.numeric(as.vector(unlist(peakgroup_list[, col_index]))) -
peakgroup_list$avg.ctrls) / peakgroup_list$sd.ctrls
peakgroup_list <- cbind(peakgroup_list, zscores_1col)
}

# apply new column names to columns at end plus avg and sd columns
colnames(peakgroup_list)[startcol:ncol(peakgroup_list)] <- colnames_zscores

return(peakgroup_list)
}

calculate_ppm_deviation <- function(peakgroup_list) {
#' Calculate ppm deviation between observed mass and expected theoretical mass
#'
#' @param peakgroup_list: Peak group list (matrix)
#'
#' @return peakgroup_list_ppm: peak group list with ppm column (matrix)

# calculate ppm deviation
for (row_index in seq_len(nrow(peakgroup_list))) {
if (!is.na(peakgroup_list$theormz_HMDB[row_index]) &&
!is.null(peakgroup_list$theormz_HMDB[row_index]) &&
(peakgroup_list$theormz_HMDB[row_index] != "")) {
peakgroup_list$ppmdev[row_index] <- 10^6 * (as.numeric(as.vector(peakgroup_list$mzmed.pgrp[row_index])) -
as.numeric(as.vector(peakgroup_list$theormz_HMDB[row_index]))) /
as.numeric(as.vector(peakgroup_list$theormz_HMDB[row_index]))
} else {
peakgroup_list$ppmdev[row_index] <- NA
}
}

return(peakgroup_list)
}

order_columns_peakgrouplist <- function(peakgroup_list) {
#' Put columns in peak group list in correct order
#'
#' @param peakgroup_list: Peak group list (matrix)
#'
#' @return peakgroup_ordered: peak group list with columns in correct order (matrix)

original_colnames <- colnames(peakgroup_list)
mass_columns <- c(grep("mzm", original_colnames), grep("nrsamples", original_colnames))
descriptive_columns <- c(grep("assi_HMDB", original_colnames):grep("avg.int", original_colnames), grep("ppmdev", original_colnames))
intensity_columns <- c((grep("nrsamples", original_colnames) + 1):(grep("assi_HMDB", original_colnames) - 1))
# if no Z-scores have been calculated, the following two variables will be empty without consequences for outlist_total
control_columns <- grep ("ctrls", original_colnames)
zscore_columns <- grep("_Zscore", original_colnames)
# create peak group list with columns in correct order
peakgroup_ordered <- peakgroup_list[ , c(mass_columns, descriptive_columns, intensity_columns, control_columns, zscore_columns)]

return(peakgroup_ordered)
}

5 changes: 5 additions & 0 deletions DIMS/tests/testthat/fixtures/test_peakgroup_list.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"mzmed.pgrp" "nrsamples" "C101.1" "C102.1" "P2.1" "P3.1" "assi_HMDB" "all_hmdb_names" "iso_HMDB" "HMDB_code" "all_hmdb_ids" "sec_hmdb_ids" "theormz_HMDB" "avg.int" "avg.ctrls" "sd.ctrls" "C101.1_Zscore" "C102.1_Zscore" "P2.1_Zscore" "P3.1_Zscore" "ppmdev"
"1" 300.199680958642 0.451108327135444 1000 5000 10000 50000 "A" "A;X" NA "HMDB1234567" "HMDB1234567;HMDB1234567" NA 300.1996476 16500 3000 2828.42712474619 9000 13000 90000 130000 0.111112214857712
"2" 300.000315890415 0.498603057814762 2000 6000 20000 60000 "B" "B;Y" NA "HMDB1234567_1" "HMDB1234567_1;HMDB1234567_1" NA 300.00017417 22000 4000 2828.42712474619 10000 14000 1e+05 140000 0.473299680976197
"3" 300.254185894039 0.589562055887654 3000 7000 30000 70000 "C" "C;Z" NA "HMDB1234567_2" "HMDB1234567_2;HMDB1234567_2" NA 300.25413357 27500 5000 2828.42712474619 11000 15000 110000 150000 0.17426158930175
"4" 300.755745105678 0.277923040557653 4000 8000 40000 80000 "D" "D;V" NA "HMDB1234567_7" "HMDB1234567_7;HMDB1234567_7" NA 300.75568892 33000 6000 2828.42712474619 12000 16000 120000 160000 0.186787674436346
77 changes: 77 additions & 0 deletions DIMS/tests/testthat/test_collect_filled.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# unit tests for CollectFilled
# functions: collapse, merge_duplicate_rows, calculate_zscores_peakgrouplist,
# calculate_ppm_deviation, order_columns_peakgrouplist
source("../../preprocessing/collect_filled_functions.R")

testthat::test_that("Values for duplicate rows are correctly collapsed", {
test_matrix <- matrix(letters[1:8], nrow = 2, ncol = 4)
colnames(test_matrix) <- paste0("column", 1:4)

expect_equal(collapse("column1", test_matrix, c(1,2)), "a;b", TRUE)
})

testthat::test_that("Duplicate rows in a peak group list are correctly merged", {
# Copy/symlink the files to the current location for the function
test_files <- list.files("fixtures/", "test_peakgroup_list", full.names = TRUE)
file.symlink(file.path(test_files), getwd())

# read in peakgroup_list, create duplicate row
test_peakgroup_list <- read.table("./test_peakgroup_list.txt", sep = "\t")
test_peakgroup_list_dup <- test_peakgroup_list[c(1, 2, 2, 3), ]

# after merging duplicate rows, the test peak group list should have 3 rows
expect_equal(nrow(merge_duplicate_rows(test_peakgroup_list_dup)), 3, TRUE, tolerance = 0.001)
expect_equal(merge_duplicate_rows(test_peakgroup_list_dup)[3, "all_hmdb_ids"],
paste(test_peakgroup_list_dup[2, "all_hmdb_ids"], test_peakgroup_list_dup[3, "all_hmdb_ids"], sep = ";"),
TRUE)
})

testthat::test_that("Z-scores are correctly calculated in CollectFilled", {
# read in peakgroup_list; remove avg.int, std.int, noise and Zscore columns
test_peakgroup_list <- read.table("./test_peakgroup_list.txt", sep = "\t")
# remove avg.ctrls, sd.ctrls and Z-score columns
test_peakgroup_list_noz <- test_peakgroup_list[ , -grep("avg.ctrls", colnames(test_peakgroup_list))]
test_peakgroup_list_noz <- test_peakgroup_list_noz[ , -grep("sd.ctrls", colnames(test_peakgroup_list_noz))]
test_peakgroup_list_noz <- test_peakgroup_list_noz[ , -grep("_Zscore", colnames(test_peakgroup_list_noz))]

# after calculate_zscores_peakgrouplist, there should be 4 columns with _Zscore in the name
expect_equal(length(grep("_Zscore", colnames(calculate_zscores_peakgrouplist(test_peakgroup_list_noz)))), 4, TRUE, tolerance = 0.001)

# after calculate_zscores_peakgrouplist, the 4 columns with _Zscore in the name should be filled non-zero
expect_equal(calculate_zscores_peakgrouplist(test_peakgroup_list_noz)$C101.1_Zscore[1], -0.7071, TRUE, tolerance = 0.00001)
expect_equal(calculate_zscores_peakgrouplist(test_peakgroup_list_noz)$P2.1_Zscore[4], 12.0208, TRUE, tolerance = 0.00001)

})

testthat::test_that("ppm deviation values are correctly calculated in CollectFilled", {
# read in peakgroup_list
test_peakgroup_list <- read.table("./test_peakgroup_list.txt", sep = "\t")

# store ppm deviation values
test_ppm_values <- test_peakgroup_list$ppmdev

# after calculate_ppm_deviation, there ppm values in the new column should approximate the old ones
expect_equal(calculate_ppm_deviation(test_peakgroup_list)$ppmdev, test_ppm_values, TRUE, tolerance = 0.001)

# calculate_ppm_deviation should give NA if there is no value for theormz_HMDB
test_peakgroup_list$theormz_HMDB[1] <- NA
expect_identical(is.na(calculate_ppm_deviation(test_peakgroup_list)$ppmdev[1]), TRUE)

})

testthat::test_that("columns in peak group list are corretly sorted", {
# read in peakgroup_list
test_peakgroup_list <- read.table("./test_peakgroup_list.txt", sep = "\t")
# original order of columns
original_column_order <- colnames(test_peakgroup_list)
# after ordering, column names should be re-ordered
test_column_order <- original_column_order[c(1, 2, 7:14, 21, 3:6, 15:20)]

expect_identical(colnames(order_columns_peakgrouplist(test_peakgroup_list)), test_column_order)

# Remove copied/symlinked files
files_remove <- list.files("./", "test_peakgroup_list.txt", full.names = TRUE)
file.remove(files_remove)

})

Loading