From e16fc5b59c4a3bba1bd7e0e76901a820fa1d4d0f Mon Sep 17 00:00:00 2001 From: Sido Haakma Date: Tue, 22 Jun 2021 10:21:18 +0200 Subject: [PATCH 1/5] feat: can work with long format as well --- R/enums.R | 7 + R/reshape.R | 68 +++- R/{reshape_helpers.R => reshape_long.R} | 10 +- R/reshape_wide.R | 480 ++++++++++++++++++++++++ R/upload.R | 7 +- 5 files changed, 548 insertions(+), 24 deletions(-) rename R/{reshape_helpers.R => reshape_long.R} (97%) create mode 100644 R/reshape_wide.R diff --git a/R/enums.R b/R/enums.R index b4e6adf..752441c 100644 --- a/R/enums.R +++ b/R/enums.R @@ -38,6 +38,13 @@ du.enum.dict.kind <- function() { list(CORE = "core", OUTCOME = "outcome", BETA = "beta") } +#' Source data types +#' +#' @noRd +du.enum.data.type <- function() { + list(LONG = "long", WIDE = "wide") +} + #' Projects that are containing dictionaries. Repositories containing these dictionaries should be: #' #' - ds-dictionaries diff --git a/R/reshape.R b/R/reshape.R index 4a0615d..1659c7b 100755 --- a/R/reshape.R +++ b/R/reshape.R @@ -2,18 +2,20 @@ #' #' @param upload do you want automatically upload the files to your opal (default = true) #' @param data_version version of the data you are going to upload into Opal +#' @param data_type default = WIDE, can be LONG as well #' @param input_format possible formats are CSV,STATA,SPSS or SAS (default = CSV) #' @param dict_version version of the dictionary #' @param dict_kind kind of data to reshape (default = core) #' @param input_path path for import file #' @param run_mode default = NORMAL, can be TEST and NON_INTERACTIIVE + #' #' @importFrom readxl read_xlsx #' #' @noRd -du.reshape <- function(upload = TRUE, project, data_version, input_format, dict_version, dict_kind, input_path, run_mode) { +du.reshape <- function(upload = TRUE, project, data_version, data_type, input_format, dict_version, dict_kind, input_path, run_mode) { message("######################################################") - message(" Start converting and uploading data ") + message(paste0(" Start converting and uploading data ( ", data_type, "-format )")) message("######################################################") message("* Setup: load data and set output directory") message("------------------------------------------------------") @@ -28,24 +30,50 @@ du.reshape <- function(upload = TRUE, project, data_version, input_format, dict_ file_name_monthly <- paste0(file_prefix, "_", data_version, "_", "monthly_repeated_measures") file_name_yearly <- paste0(file_prefix, "_", data_version, "_", "yearly_repeated_measures") - nonrep_data <- du.reshape.generate.non.repeated( - data, dict_kind - ) + if(data_type == du.enum.data.type()$WIDE) { + nonrep_data <- du.reshape.wide.generate.non.repeated( + data, dict_kind + ) + } else { + nonrep_data <- du.reshape.long.generate.non.repeated( + data, dict_kind + ) + } if (!is.null(nonrep_data)) write_csv(nonrep_data, paste0(getwd(), "/", file_name_nonrep, ".csv"), na = "") - yearlyrep_data <- du.reshape.generate.yearly.repeated( - data, dict_kind - ) + + if(data_type == du.enum.data.type()$WIDE) { + yearlyrep_data <- du.reshape.wide.generate.yearly.repeated( + data, dict_kind + ) + } else { + #yearlyrep_data <- du.reshape.long.generate.yearly.repeated( + # data, dict_kind + #) + } if (!is.null(yearlyrep_data)) write_csv(yearlyrep_data, paste0(getwd(), "/", file_name_yearly, ".csv"), na = "") - monthlyrep_data <- du.reshape.generate.monthly.repeated( - data, dict_kind - ) + + if(data_type == du.enum.data.type()$WIDE) { + monthlyrep_data <- du.reshape.wide.generate.monthly.repeated( + data, dict_kind + ) + } else { + #monthlyrep_data <- du.reshape.long.generate.monthly.repeated( + # data, dict_kind + #) + } if (!is.null(monthlyrep_data)) write_csv(monthlyrep_data, paste0(getwd(), "/", file_name_monthly, ".csv"), na = "") if (dict_kind == du.enum.dict.kind()$OUTCOME) { file_name_weekly <- paste0(file_prefix, "_", data_version, "_", "weekly_repeated_measures") - weeklyrep_data <- du.reshape.generate.weekly.repeated( - data, dict_kind - ) + if(data_type == du.enum.data.type()$WIDE) { + weeklyrep_data <- du.reshape.wide.generate.weekly.repeated( + data, dict_kind + ) + } else { + #weeklyrep_data <- du.reshape.long.generate.weekly.repeated( + # data, dict_kind + #) + } if (!is.null(weeklyrep_data)) { write_csv(weeklyrep_data, paste0(getwd(), "/", file_name_weekly, ".csv"), na = "") weeklyrep_metadata <- du.retrieve.full.dict(du.enum.table.types()$WEEKLY, dict_kind) @@ -63,9 +91,15 @@ du.reshape <- function(upload = TRUE, project, data_version, input_format, dict_ if (dict_kind == du.enum.dict.kind()$CORE & dict_version != "1_0") { file_name_trimester <- paste0(file_prefix, "_", data_version, "_", "trimester_repeated_measures") - trimester_data <- du.reshape.generate.trimesterly.repeated( - data, dict_kind - ) + if(data_type == du.enum.data.type()$WIDE) { + trimester_data <- du.reshape.wide.generate.trimesterly.repeated( + data, dict_kind + ) + } else { + #trimester_data <- du.reshape.long.generate.trimesterly.repeated( + # data, dict_kind + #) + } if (!is.null(trimester_data)) { write_csv(trimester_data, paste0(getwd(), "/", file_name_trimester, ".csv"), na = "") trimester_metadata <- du.retrieve.full.dict(du.enum.table.types()$TRIMESTER, dict_kind) diff --git a/R/reshape_helpers.R b/R/reshape_long.R similarity index 97% rename from R/reshape_helpers.R rename to R/reshape_long.R index a58b18a..310820d 100644 --- a/R/reshape_helpers.R +++ b/R/reshape_long.R @@ -156,7 +156,7 @@ du.check.nas <- function(stripped, raw) { #' @importFrom readxl read_xlsx #' #' @noRd -du.reshape.generate.non.repeated <- function(data, dict_kind) { +du.reshape.long.generate.non.repeated <- function(data, dict_kind) { message("* Generating: non-repeated measures") # Retrieve dictionary @@ -192,7 +192,7 @@ du.reshape.generate.non.repeated <- function(data, dict_kind) { #' @importFrom tidyr gather #' #' @noRd -du.reshape.generate.yearly.repeated <- function(data, dict_kind) { +du.reshape.long.generate.yearly.repeated <- function(data, dict_kind) { # workaround to avoid glpobal variable warnings, check: # https://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when orig_var <- value <- age_years <- . <- NULL @@ -264,7 +264,7 @@ du.reshape.generate.yearly.repeated <- function(data, dict_kind) { #' @importFrom tidyr gather #' #' @noRd -du.reshape.generate.monthly.repeated <- function(data, dict_kind) { +du.reshape.long.generate.monthly.repeated <- function(data, dict_kind) { # workaround to avoid glpobal variable warnings, check: # https://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when orig_var <- value <- age_months <- . <- NULL @@ -338,7 +338,7 @@ du.reshape.generate.monthly.repeated <- function(data, dict_kind) { #' @importFrom tidyr gather #' #' @noRd -du.reshape.generate.weekly.repeated <- function(data, dict_kind) { +du.reshape.long.generate.weekly.repeated <- function(data, dict_kind) { # workaround to avoid glpobal variable warnings, check: # https://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when orig_var <- value <- age_weeks <- . <- NULL # Gestational age in weeks @@ -414,7 +414,7 @@ du.reshape.generate.weekly.repeated <- function(data, dict_kind) { #' @importFrom tidyr gather #' #' @noRd -du.reshape.generate.trimesterly.repeated <- function(data, dict_kind) { +du.reshape.long.generate.trimesterly.repeated <- function(data, dict_kind) { # workaround to avoid glpobal variable warnings, check: # https://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when orig_var <- value <- age_trimester <- . <- NULL diff --git a/R/reshape_wide.R b/R/reshape_wide.R new file mode 100644 index 0000000..c2b03f3 --- /dev/null +++ b/R/reshape_wide.R @@ -0,0 +1,480 @@ +#' Read the input file from different sources +#' +#' @param input_format possible formats are CSV,STATA,SPSS or SAS (default = CSV) +#' @param input_path path for importfile +#' +#' @importFrom readr read_csv cols col_double +#' @importFrom haven read_dta read_sas read_spss +#' +#' @return dataframe with source data +#' +#' @noRd +du.read.source.file <- function(input_path, input_format) { + du_data <- NULL + + if (input_format %in% du.enum.input.format()) { + if (input_format == du.enum.input.format()$STATA) { + data <- read_dta(input_path) + } else if (input_format == du.enum.input.format()$SPSS) { + data <- read_spss(input_path) + } else if (input_format == du.enum.input.format()$SAS) { + data <- read_sas(input_path) + } else if (input_format == du.enum.input.format()$R) { + data <- source(input_path) + } else { + data <- read_csv(input_path, col_types = cols(.default = col_double())) + } + } else { + stop(paste0( + input_format, " is not a valid input format, Possible input formats are: ", + paste(du.enum.input.format(), collapse = ", ") + )) + } + + return(data) +} + +#' Get the table without rows containing only NA's. +#' +#' We have to remove the first column (child_id), that is generated always. +#' +#' @param dataframe dataframe to check +#' +#' @importFrom dplyr %>% +#' +#' @return dataframe without the na values +#' +#' @noRd +du.data.frame.remove.all.na.rows <- function(dataframe) { + df <- dataframe[-c(1)] + + naLines <- df %>% + is.na() %>% + apply(MARGIN = 1, FUN = all) + + return(df[!naLines, ]) +} +#' +#' Matched the columns in the source data. +#' You can then match the found column against the dictionary. +#' +#' @param data_columns columns obtained from raw data +#' @param dict_columns columns matched in the dictionary +#' +#' @importFrom stringr str_subset +#' +#' @return matched_columns in source data +#' +#' @noRd +du.match.columns <- function(data_columns, dict_columns) { + matched_columns <- character() + + matched_columns <- data_columns[data_columns %in% dict_columns] + + for (variable in dict_columns) { + matched_columns <- c(matched_columns, data_columns %>% str_subset(pattern = paste0("^", + variable, "\\d+", + sep = "" + ))) + } + # Select the non-repeated measures from the full data set + return(matched_columns) +} + +#' +#' Check if there are columns not matching the dictionary. +#' +#' @param dict_kind specify which dictionary you want to check +#' @param data_columns the coiumns within the data +#' @param run_mode default = NORMAL, can be TEST and NON_INTERACTIIVE +#' +#' @return stops the program if someone terminates +#' +#' @noRd +du.check.variables <- function(dict_kind, data_columns, run_mode) { + variables <- du.retrieve.dictionaries(dict_kind = dict_kind) + + matched_columns <- du.match.columns(data_columns, variables$name) + + columns_not_matched <- data_columns[!(data_columns %in% matched_columns)] + + if (length(columns_not_matched) > 0) { + message(paste0("[WARNING] This is an unmatched column, it will be dropped : [ ", columns_not_matched, " ].", sep = '\n')) + if (run_mode != du.enum.run.mode()$NON_INTERACTIVE) { + proceed <- readline("Do you want to proceed (y/n)") + } else { + proceed <- "y" + } + } else { + proceed <- "y" + } + if (proceed == "n") { + message(paste0(columns_not_matched, sep = '\n')) + stop("Program is terminated. There are unmatched columns in your source data.") + } +} + +#' Check for NA columns +#' +#' @param stripped variables without NA values +#' @param raw original variables +#' @param run_mode the run mode of the package +#' +#' @return stops the program if someone terminates +#' +#' @noRd +du.check.nas <- function(stripped, raw) { + + # remove child_id + raw <- raw[-1] + + variables_na <- raw[!(raw %in% stripped)] + + if (length(variables_na) > 0) { + message(paste0("[WARNING] Variable dropped because completely missing: [ ", variables_na, " ]", sep = '\n')) + if (ds_upload.globals$run_mode != du.enum.run.mode()$NON_INTERACTIVE) { + proceed <- readline("Do you want to proceed (y/n)") + } else { + proceed <- "y" + } + } else { + proceed <- "y" + } + if (proceed == "n") { + message(paste0(variables_na, sep = '\n')) + stop("Program is terminated. There are columns in your source data that are completely missing.") + } +} + +#' Generate the yearly repeated measures file and write it to your local workspace +#' +#' @param data data frame with all the data based upon the CSV file +#' @param dict_kind can be 'core' or 'outcome' +#' +#' @importFrom readr write_csv +#' @importFrom dplyr %>% +#' @importFrom readxl read_xlsx +#' +#' @noRd +du.reshape.wide.generate.non.repeated <- function(data, dict_kind) { + message("* Generating: non-repeated measures") + + # Retrieve dictionary + variables_non_repeated_dict <- du.retrieve.dictionaries(du.enum.table.types()$NONREP, dict_kind) + + # select the non-repeated measures from the full data set + non_repeated <- c("child_id", variables_non_repeated_dict$name) + non_repeated_measures <- data[, which(colnames(data) %in% non_repeated)] + + # strip the rows with na values + stripped_non_repeated_measures <- non_repeated_measures[, colSums(is.na(non_repeated_measures)) < + nrow(non_repeated_measures)] + + du.check.nas(colnames(stripped_non_repeated_measures), colnames(non_repeated_measures)) + + # add row_id again to preserve child_id + stripped_non_repeated_measures <- data.frame( + row_id = c(1:length(non_repeated_measures$child_id)), + non_repeated_measures + ) + + return(as.data.frame(stripped_non_repeated_measures)) +} + +#' Generate the yearly repeated measures file and write it to your local workspace +#' +#' @param data data frame with all the data based upon the CSV file +#' @param dict_kind can be 'core' or 'outcome' +#' +#' @importFrom readr write_csv +#' @importFrom dplyr %>% filter summarise bind_rows +#' @importFrom maditr dcast as.data.table %<>% +#' @importFrom tidyr gather +#' +#' @noRd +du.reshape.wide.generate.yearly.repeated <- function(data, dict_kind) { + # workaround to avoid glpobal variable warnings, check: + # https://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when + orig_var <- value <- age_years <- . <- NULL + + message("* Generating: yearly-repeated measures") + + variables_yearly_repeated_dict <- du.retrieve.dictionaries(du.enum.table.types()$YEARLY, dict_kind) + matched_columns <- du.match.columns(colnames(data), variables_yearly_repeated_dict$name) + yearly_repeated_measures <- data[matched_columns] + + if (nrow(du.data.frame.remove.all.na.rows(yearly_repeated_measures)) <= 0) { + message("[WARNING] No yearly-repeated measures found in this set") + return() + } + + long_1 <- yearly_repeated_measures %>% gather(orig_var, value, matched_columns[matched_columns != + "child_id"], na.rm = TRUE) + + # Create the age_years variable with the regular expression extraction of the year + long_1$age_years <- as.numeric(du.num.extract(long_1$orig_var)) + + # Here we remove the year indicator from the original variable name + long_1$variable_trunc <- gsub("[[:digit:]]+$", "", long_1$orig_var) + + raw <- unique(gsub("[[:digit:]]+$", "", colnames(yearly_repeated_measures))) + du.check.nas(unique(long_1$variable_trunc), raw) + + # Use the maditr package for spreading the data again, as tidyverse runs into memory + # issues + long_2 <- dcast(long_1, child_id + age_years ~ variable_trunc, value.var = "value") + + # As the data table is still too big for opal, remove those rows, that have only + # missing values, but keep all rows at age_years=0, so no child_id get's lost: + + # Subset of data with age_years = 0 + zero_year <- long_2 %>% filter(age_years %in% 0) + + for (id in unique(yearly_repeated_measures$child_id)) { + if (!(id %in% zero_year$child_id)) { + zero_year %<>% summarise(child_id = id, age_years = 0) %>% bind_rows( + zero_year, + ) + } + } + + # Subset of data with age_years > 0 + later_year <- long_2 %>% filter(age_years > 0) + + # Bind the 0 year and older data sets together + long_2 <- rbind(zero_year, later_year) + + # Create a row_id so there is a unique identifier for the rows + long_2$row_id <- c(1:length(long_2$child_id)) + + # Arrange the variable names based on the original order + long_yearly <- long_2[, c("row_id", "child_id", "age_years", unique(long_1$variable_trunc))] + + return(as.data.frame(long_yearly)) +} + +#' Generate the monthly repeated measures file and write it to your local workspace +#' +#' @param data data frame with all the data based upon the CSV file +#' @param dict_kind can be 'core' or 'outcome' +#' +#' @importFrom readr write_csv +#' @importFrom dplyr %>% filter summarise bind_rows +#' @importFrom maditr dcast as.data.table %<>% +#' @importFrom tidyr gather +#' +#' @noRd +du.reshape.wide.generate.monthly.repeated <- function(data, dict_kind) { + # workaround to avoid glpobal variable warnings, check: + # https://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when + orig_var <- value <- age_months <- . <- NULL + + message("* Generating: monthly-repeated measures") + + variables_monthly_repeated_dict <- du.retrieve.dictionaries(du.enum.table.types()$MONTHLY, dict_kind) + matched_columns <- du.match.columns(colnames(data), variables_monthly_repeated_dict$name) + monthly_repeated_measures <- data[, matched_columns] + + if (nrow(du.data.frame.remove.all.na.rows(monthly_repeated_measures)) <= 0) { + message("[WARNING] No monthly-repeated measures found in this set") + return() + } + + long_1 <- monthly_repeated_measures %>% gather(orig_var, value, matched_columns[matched_columns != + "child_id"], na.rm = TRUE) + + # Create the age_years and age_months variables with the regular expression + # extraction of the year + long_1$age_years <- as.integer(as.numeric(du.num.extract(long_1$orig_var)) / 12) + long_1$age_months <- as.numeric(du.num.extract(long_1$orig_var)) + + # Here we remove the year indicator from the original variable name + long_1$variable_trunc <- gsub("[[:digit:]]+$", "", long_1$orig_var) + + raw <- unique(gsub("[[:digit:]]+$", "", colnames(monthly_repeated_measures))) + du.check.nas(unique(long_1$variable_trunc), raw) + + # Use the maditr package for spreading the data again, as tidyverse ruins into memory + # issues + long_2 <- dcast(long_1, child_id + age_years + age_months ~ variable_trunc, value.var = "value") + + # As the data table is still too big for opal, remove those rows, that have only + # missing values, but keep all rows at age_years=0, so no child_id get's lost: + + # Subset of data with age_months = 0 + zero_monthly <- long_2 %>% filter(age_months %in% 0) + + for (id in unique(monthly_repeated_measures$child_id)) { + if (!(id %in% zero_monthly$child_id)) { + zero_monthly %<>% summarise(child_id = id, age_months = 0) %>% bind_rows( + zero_monthly, + ) + } + } + + # Subset of data with age_months > 0 + later_monthly <- long_2 %>% filter(age_months > 0) + + # Bind the 0 year and older data sets together + long_2 <- rbind(zero_monthly, later_monthly) + + # Create a row_id so there is a unique identifier for the rows + long_2$row_id <- c(1:length(long_2$child_id)) + + # Arrange the variable names based on the original order + long_monthly <- long_2[, c("row_id", "child_id", "age_years", "age_months", unique(long_1$variable_trunc))] + + return(as.data.frame(long_monthly)) +} + +#' Generate the weekly repeated measures file and write it to your local workspace +#' +#' @param data data frame with all the data based upon the CSV file +#' @param dict_kind can be 'core' or 'outcome' +#' +#' @importFrom readr write_csv +#' @importFrom dplyr %>% filter summarise bind_rows +#' @importFrom maditr dcast as.data.table %<>% +#' @importFrom tidyr gather +#' +#' @noRd +du.reshape.wide.generate.weekly.repeated <- function(data, dict_kind) { + # workaround to avoid glpobal variable warnings, check: + # https://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when + orig_var <- value <- age_weeks <- . <- NULL # Gestational age in weeks + + message("* Generating: weekly-repeated measures") + + variables_weekly_repeated_dict <- du.retrieve.dictionaries(du.enum.table.types()$WEEKLY, dict_kind) + matched_columns <- du.match.columns(colnames(data), variables_weekly_repeated_dict$name) + weekly_repeated_measures <- data[, matched_columns] + + if (nrow(du.data.frame.remove.all.na.rows(weekly_repeated_measures)) <= 0) { + message("[WARNING] No weekly-repeated measures found in this set") + return() + } + + long_1 <- weekly_repeated_measures %>% gather(orig_var, value, matched_columns[matched_columns != + "child_id"], na.rm = TRUE) + + # Create the age_years and age_months variables with the regular expression + # extraction of the year NB - these weekly dta are pregnancy related so child is NOT + # BORN YET --- + long_1$age_years <- as.integer(as.numeric(du.num.extract(long_1$orig_var)) / 52) + long_1$age_weeks <- as.integer(du.num.extract(long_1$orig_var)) + + # Here we remove the year indicator from the original variable name + long_1$variable_trunc <- gsub("[[:digit:]]+$", "", long_1$orig_var) + + raw <- unique(gsub("[[:digit:]]+$", "", colnames(weekly_repeated_measures))) + du.check.nas(unique(long_1$variable_trunc), raw) + + # Use the maditr package for spreading the data again, as tidyverse ruins into memory + # issues + long_2 <- dcast(long_1, child_id + age_years + age_weeks ~ variable_trunc, value.var = "value") + + # As the data table is still too big for opal, remove those rows, that have only + # missing values, but keep all rows at age_years=0, so no child_id get's lost: + + # Subset of data with age_months = 0 + zero_weekly <- long_2 %>% filter(age_weeks %in% 0) + + for (id in unique(weekly_repeated_measures$child_id)) { + if (!(id %in% zero_weekly$child_id)) { + zero_weekly %<>% summarise(child_id = id, age_weeks = 0) %>% bind_rows( + zero_weekly, + ) + } + } + + # Subset of data with age_months > 0 + later_weekly <- long_2 %>% filter(age_weeks > 0) + + # Bind the 0 year and older data sets together + long_2 <- rbind(zero_weekly, later_weekly) + + # Create a row_id so there is a unique identifier for the rows + long_2$row_id <- c(1:length(long_2$child_id)) + + # Arrange the variable names based on the original order + long_weekly <- long_2[, c("row_id", "child_id", "age_years", "age_weeks", unique(long_1$variable_trunc))] + + return(as.data.frame(long_weekly)) +} + + +#' Generate the trimesterly repeated measures file and write it to your local workspace +#' +#' @param data data frame with all the data based upon the CSV file +#' @param dict_kind can be 'core' or 'outcome' +#' +#' @importFrom readr write_csv +#' @importFrom dplyr %>% filter summarise bind_rows +#' @importFrom maditr dcast as.data.table %<>% +#' @importFrom tidyr gather +#' +#' @noRd +du.reshape.wide.generate.trimesterly.repeated <- function(data, dict_kind) { + # workaround to avoid glpobal variable warnings, check: + # https://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when + orig_var <- value <- age_trimester <- . <- NULL + + message("* Generating: trimesterly-repeated measures") + + variables_trimesterly_repeated_dict <- du.retrieve.dictionaries( + du.enum.table.types()$TRIMESTER, + dict_kind + ) + matched_columns <- du.match.columns(colnames(data), variables_trimesterly_repeated_dict$name) + trimesterly_repeated_measures <- data[, matched_columns] + + if (nrow(du.data.frame.remove.all.na.rows(trimesterly_repeated_measures)) <= 0) { + message("[WARNING] No trimesterly-repeated measures found in this set") + return() + } + + long_1 <- trimesterly_repeated_measures %>% gather(orig_var, value, matched_columns[matched_columns != + "child_id"], na.rm = TRUE) + + # Create the age_years and age_months variables with the regular expression + # extraction of the year + long_1$age_trimester <- as.numeric(du.num.extract(long_1$orig_var)) + + # Here we remove the year indicator from the original variable name + long_1$variable_trunc <- gsub("[[:digit:]]+$", "", long_1$orig_var) + + raw <- unique(gsub("[[:digit:]]+$", "", colnames(trimesterly_repeated_measures))) + du.check.nas(unique(long_1$variable_trunc), raw) + + # Use the maditr package for spreading the data again, as tidyverse ruins into memory + # issues + long_2 <- dcast(long_1, child_id + age_trimester ~ variable_trunc, value.var = "value") + + # As the data table is still too big for opal, remove those rows, that have only + # missing values, but keep all rows at age_years=0, so no child_id get's lost: + + # Subset of data with age_months = 0 + one_trimesterly <- long_2 %>% filter(age_trimester %in% 1) + + for (id in unique(trimesterly_repeated_measures$child_id)) { + if (!(id %in% one_trimesterly$child_id)) { + one_trimesterly %<>% summarise(child_id = id, age_trimester = 1) %>% bind_rows( + one_trimesterly, + . + ) + } + } + + # Subset of data with age_months > 0 + later_trimesterly <- long_2 %>% filter(age_trimester > 1) + + long_2 <- rbind(one_trimesterly, later_trimesterly) + + # Create a row_id so there is a unique identifier for the rows + long_2$row_id <- c(1:length(long_2$child_id)) + + # Arrange the variable names based on the original order + long_trimesterly <- long_2[, c("row_id", "child_id", "age_trimester", unique(long_1$variable_trunc))] + + return(as.data.frame(long_trimesterly)) +} diff --git a/R/upload.R b/R/upload.R index 423c93b..f2e2a67 100644 --- a/R/upload.R +++ b/R/upload.R @@ -13,6 +13,7 @@ ds_upload.globals <- new.env() #' @param data_input_path path to the to-be-reshaped data #' @param action action to be performed, can be 'reshape', 'populate' or 'all' #' @param run_mode default = NORMAL, can be TEST and NON_INTERACTIIVE +#' @param data_type default = wide, can be long as well #' #' @examples #' \dontrun{ @@ -28,7 +29,7 @@ ds_upload.globals <- new.env() #' @export du.upload <- function(dict_version = "2_1", data_version = "1_0", dict_kind = du.enum.dict.kind()$CORE, cohort_id, database_name = "opal_data", data_input_format = du.enum.input.format()$CSV, data_input_path, - action = du.enum.action()$ALL, upload = TRUE, run_mode = du.enum.run.mode()$NORMAL) { + action = du.enum.action()$ALL, upload = TRUE, run_mode = du.enum.run.mode()$NORMAL, data_type = du.enum.data.type()$WIDE) { message("######################################################") message(" Start upload data into DataSHIELD backend") message("------------------------------------------------------") @@ -100,8 +101,10 @@ du.upload <- function(dict_version = "2_1", data_version = "1_0", dict_kind = du if (missing(data_input_format)) { data_input_format <- du.enum.input.format()$CSV } + + du.reshape( - upload, project, data_version, data_input_format, dict_version, + upload, project, data_version, data_type, data_input_format, dict_version, dict_kind, data_input_path, run_mode ) } From 9ba07405dbec0c84699fefb97c6b7af2a83bb7cd Mon Sep 17 00:00:00 2001 From: Sido Haakma Date: Thu, 24 Jun 2021 12:27:30 +0200 Subject: [PATCH 2/5] chore: add test cohort --- R/enums.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/enums.R b/R/enums.R index 752441c..06a8b4c 100644 --- a/R/enums.R +++ b/R/enums.R @@ -6,7 +6,7 @@ du.enum.cohorts <- function() { DNBC = "dnbc", GECKO = "gecko", ALSPAC = "alspac", GENR = "genr", MOBA = "moba", SWS = "sws", BIB = "bib", CHOP = "chop", ELFE = "elfe", EDEN = "eden", NINFEA = "ninfea", HBCS = "hbcs", INMA = "inma", ISGLOBAL = "isglobal", NBFC66 = "nfbc66", NBFC86 = "nfbc86", RAINE = "raine", RHEA = "rhea", ABCD = "abcd", BISC = "bisc", ENVIRONAGE = "environage", KANC = "kanc", PELAGIE = "pelagie", SEPAGES = "sepages", TNG = "tng", HGS = "hgs", RECETOX = "recetox", - GENXXI = "genxxi" + GENXXI = "genxxi", TEST = "test" ) } From 96d053e11e400973083dc695e3836ed817745621 Mon Sep 17 00:00:00 2001 From: Sido Haakma Date: Tue, 20 Jul 2021 13:04:10 +0200 Subject: [PATCH 3/5] feat: added genesis --- DESCRIPTION | 2 +- R/enums.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 55f2c53..caeb069 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: dsUpload Title: Upload Functions for DataSHIELD Backends -Version: 4.2.2 +Version: 4.3.0 Authors@R: c(person(given = "Sido", family = "Haakma", diff --git a/R/enums.R b/R/enums.R index b4e6adf..55e6bbd 100644 --- a/R/enums.R +++ b/R/enums.R @@ -6,7 +6,7 @@ du.enum.cohorts <- function() { DNBC = "dnbc", GECKO = "gecko", ALSPAC = "alspac", GENR = "genr", MOBA = "moba", SWS = "sws", BIB = "bib", CHOP = "chop", ELFE = "elfe", EDEN = "eden", NINFEA = "ninfea", HBCS = "hbcs", INMA = "inma", ISGLOBAL = "isglobal", NBFC66 = "nfbc66", NBFC86 = "nfbc86", RAINE = "raine", RHEA = "rhea", ABCD = "abcd", BISC = "bisc", ENVIRONAGE = "environage", KANC = "kanc", PELAGIE = "pelagie", SEPAGES = "sepages", TNG = "tng", HGS = "hgs", RECETOX = "recetox", - GENXXI = "genxxi" + GENXXI = "genxxi", GENESIS="genesis" ) } From def7625592bb05f5989b83fb50ba5588c1d4f394 Mon Sep 17 00:00:00 2001 From: Sido Haakma Date: Wed, 3 Nov 2021 12:17:50 +0100 Subject: [PATCH 4/5] chore: remove unused lazydata prop --- DESCRIPTION | 1 - 1 file changed, 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 28a378e..be4ae68 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -72,5 +72,4 @@ Additional_repositories: VignetteBuilder: knitr License: GPL-3 Encoding: UTF-8 -LazyData: true RoxygenNote: 7.1.2 From 57b5c1b92a9c0528d00126c9d036017ab666ec60 Mon Sep 17 00:00:00 2001 From: Sido Haakma Date: Wed, 3 Nov 2021 12:22:40 +0100 Subject: [PATCH 5/5] docs: rerendered docs --- man/du.upload.Rd | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/man/du.upload.Rd b/man/du.upload.Rd index dfa47b2..9cf9de9 100644 --- a/man/du.upload.Rd +++ b/man/du.upload.Rd @@ -14,7 +14,8 @@ du.upload( data_input_path, action = du.enum.action()$ALL, upload = TRUE, - run_mode = du.enum.run.mode()$NORMAL + run_mode = du.enum.run.mode()$NORMAL, + data_type = du.enum.data.type()$WIDE ) } \arguments{ @@ -37,6 +38,8 @@ du.upload( \item{upload}{do you want to upload the data (true or false)} \item{run_mode}{default = NORMAL, can be TEST and NON_INTERACTIIVE} + +\item{data_type}{default = wide, can be long as well} } \description{ Upload dictionaries and data into the DataSHIELD backend.