From e16fc5b59c4a3bba1bd7e0e76901a820fa1d4d0f Mon Sep 17 00:00:00 2001
From: Sido Haakma <sido@haakma.org>
Date: Tue, 22 Jun 2021 10:21:18 +0200
Subject: [PATCH 1/5] feat: can work with long format as well

---
 R/enums.R                               |   7 +
 R/reshape.R                             |  68 +++-
 R/{reshape_helpers.R => reshape_long.R} |  10 +-
 R/reshape_wide.R                        | 480 ++++++++++++++++++++++++
 R/upload.R                              |   7 +-
 5 files changed, 548 insertions(+), 24 deletions(-)
 rename R/{reshape_helpers.R => reshape_long.R} (97%)
 create mode 100644 R/reshape_wide.R

diff --git a/R/enums.R b/R/enums.R
index b4e6adf..752441c 100644
--- a/R/enums.R
+++ b/R/enums.R
@@ -38,6 +38,13 @@ du.enum.dict.kind <- function() {
   list(CORE = "core", OUTCOME = "outcome", BETA = "beta")
 }
 
+#' Source data types
+#'
+#' @noRd
+du.enum.data.type <- function() {
+  list(LONG = "long", WIDE = "wide")
+}
+
 #' Projects that are containing dictionaries. Repositories containing these dictionaries should be:
 #'
 #' - ds-dictionaries
diff --git a/R/reshape.R b/R/reshape.R
index 4a0615d..1659c7b 100755
--- a/R/reshape.R
+++ b/R/reshape.R
@@ -2,18 +2,20 @@
 #'
 #' @param upload do you want automatically upload the files to your opal (default = true)
 #' @param data_version version of the data you are going to upload into Opal
+#' @param data_type default = WIDE, can be LONG as well
 #' @param input_format possible formats are CSV,STATA,SPSS or SAS (default = CSV)
 #' @param dict_version version of the dictionary
 #' @param dict_kind kind of data to reshape (default = core)
 #' @param input_path path for import file
 #' @param run_mode default = NORMAL, can be TEST and NON_INTERACTIIVE
+
 #'
 #' @importFrom readxl read_xlsx
 #'
 #' @noRd
-du.reshape <- function(upload = TRUE, project, data_version, input_format, dict_version, dict_kind, input_path, run_mode) {
+du.reshape <- function(upload = TRUE, project, data_version, data_type, input_format, dict_version, dict_kind, input_path, run_mode) {
   message("######################################################")
-  message("  Start converting and uploading data                 ")
+  message(paste0("  Start converting and uploading data  ( ", data_type, "-format )"))
   message("######################################################")
   message("* Setup: load data and set output directory")
   message("------------------------------------------------------")
@@ -28,24 +30,50 @@ du.reshape <- function(upload = TRUE, project, data_version, input_format, dict_
   file_name_monthly <- paste0(file_prefix, "_", data_version, "_", "monthly_repeated_measures")
   file_name_yearly <- paste0(file_prefix, "_", data_version, "_", "yearly_repeated_measures")
 
-  nonrep_data <- du.reshape.generate.non.repeated(
-    data, dict_kind
-  )
+  if(data_type == du.enum.data.type()$WIDE) {
+    nonrep_data <- du.reshape.wide.generate.non.repeated(
+      data, dict_kind
+    )
+  } else {
+    nonrep_data <- du.reshape.long.generate.non.repeated(
+      data, dict_kind
+    )
+  }
   if (!is.null(nonrep_data)) write_csv(nonrep_data, paste0(getwd(), "/", file_name_nonrep, ".csv"), na = "")
-  yearlyrep_data <- du.reshape.generate.yearly.repeated(
-    data, dict_kind
-  )
+  
+  if(data_type == du.enum.data.type()$WIDE) {
+    yearlyrep_data <- du.reshape.wide.generate.yearly.repeated(
+      data, dict_kind
+    )
+  } else {
+    #yearlyrep_data <- du.reshape.long.generate.yearly.repeated(
+    #  data, dict_kind
+    #)
+  }
   if (!is.null(yearlyrep_data)) write_csv(yearlyrep_data, paste0(getwd(), "/", file_name_yearly, ".csv"), na = "")
-  monthlyrep_data <- du.reshape.generate.monthly.repeated(
-    data, dict_kind
-  )
+  
+  if(data_type == du.enum.data.type()$WIDE) {
+    monthlyrep_data <- du.reshape.wide.generate.monthly.repeated(
+      data, dict_kind
+    )
+  } else {
+    #monthlyrep_data <- du.reshape.long.generate.monthly.repeated(
+    #  data, dict_kind
+    #)
+  }
   if (!is.null(monthlyrep_data)) write_csv(monthlyrep_data, paste0(getwd(), "/", file_name_monthly, ".csv"), na = "")
 
   if (dict_kind == du.enum.dict.kind()$OUTCOME) {
     file_name_weekly <- paste0(file_prefix, "_", data_version, "_", "weekly_repeated_measures")
-    weeklyrep_data <- du.reshape.generate.weekly.repeated(
-      data, dict_kind
-    )
+    if(data_type == du.enum.data.type()$WIDE) {
+      weeklyrep_data <- du.reshape.wide.generate.weekly.repeated(
+        data, dict_kind
+      )
+    } else {
+      #weeklyrep_data <- du.reshape.long.generate.weekly.repeated(
+      #  data, dict_kind
+      #)
+    }
     if (!is.null(weeklyrep_data)) {
       write_csv(weeklyrep_data, paste0(getwd(), "/", file_name_weekly, ".csv"), na = "")
       weeklyrep_metadata <- du.retrieve.full.dict(du.enum.table.types()$WEEKLY, dict_kind)
@@ -63,9 +91,15 @@ du.reshape <- function(upload = TRUE, project, data_version, input_format, dict_
 
   if (dict_kind == du.enum.dict.kind()$CORE & dict_version != "1_0") {
     file_name_trimester <- paste0(file_prefix, "_", data_version, "_", "trimester_repeated_measures")
-    trimester_data <- du.reshape.generate.trimesterly.repeated(
-      data, dict_kind
-    )
+    if(data_type == du.enum.data.type()$WIDE) {
+      trimester_data <- du.reshape.wide.generate.trimesterly.repeated(
+        data, dict_kind
+      )
+    } else {
+      #trimester_data <- du.reshape.long.generate.trimesterly.repeated(
+      #  data, dict_kind
+      #)
+    }
     if (!is.null(trimester_data)) {
       write_csv(trimester_data, paste0(getwd(), "/", file_name_trimester, ".csv"), na = "")
       trimester_metadata <- du.retrieve.full.dict(du.enum.table.types()$TRIMESTER, dict_kind)
diff --git a/R/reshape_helpers.R b/R/reshape_long.R
similarity index 97%
rename from R/reshape_helpers.R
rename to R/reshape_long.R
index a58b18a..310820d 100644
--- a/R/reshape_helpers.R
+++ b/R/reshape_long.R
@@ -156,7 +156,7 @@ du.check.nas <- function(stripped, raw) {
 #' @importFrom readxl read_xlsx
 #'
 #' @noRd
-du.reshape.generate.non.repeated <- function(data, dict_kind) {
+du.reshape.long.generate.non.repeated <- function(data, dict_kind) {
   message("* Generating: non-repeated measures")
 
   # Retrieve dictionary
@@ -192,7 +192,7 @@ du.reshape.generate.non.repeated <- function(data, dict_kind) {
 #' @importFrom tidyr gather
 #'
 #' @noRd
-du.reshape.generate.yearly.repeated <- function(data, dict_kind) {
+du.reshape.long.generate.yearly.repeated <- function(data, dict_kind) {
   # workaround to avoid glpobal variable warnings, check:
   # https://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when
   orig_var <- value <- age_years <- . <- NULL
@@ -264,7 +264,7 @@ du.reshape.generate.yearly.repeated <- function(data, dict_kind) {
 #' @importFrom tidyr gather
 #'
 #' @noRd
-du.reshape.generate.monthly.repeated <- function(data, dict_kind) {
+du.reshape.long.generate.monthly.repeated <- function(data, dict_kind) {
   # workaround to avoid glpobal variable warnings, check:
   # https://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when
   orig_var <- value <- age_months <- . <- NULL
@@ -338,7 +338,7 @@ du.reshape.generate.monthly.repeated <- function(data, dict_kind) {
 #' @importFrom tidyr gather
 #'
 #' @noRd
-du.reshape.generate.weekly.repeated <- function(data, dict_kind) {
+du.reshape.long.generate.weekly.repeated <- function(data, dict_kind) {
   # workaround to avoid glpobal variable warnings, check:
   # https://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when
   orig_var <- value <- age_weeks <- . <- NULL # Gestational age in weeks
@@ -414,7 +414,7 @@ du.reshape.generate.weekly.repeated <- function(data, dict_kind) {
 #' @importFrom tidyr gather
 #'
 #' @noRd
-du.reshape.generate.trimesterly.repeated <- function(data, dict_kind) {
+du.reshape.long.generate.trimesterly.repeated <- function(data, dict_kind) {
   # workaround to avoid glpobal variable warnings, check:
   # https://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when
   orig_var <- value <- age_trimester <- . <- NULL
diff --git a/R/reshape_wide.R b/R/reshape_wide.R
new file mode 100644
index 0000000..c2b03f3
--- /dev/null
+++ b/R/reshape_wide.R
@@ -0,0 +1,480 @@
+#' Read the input file from different sources
+#'
+#' @param input_format possible formats are CSV,STATA,SPSS or SAS (default = CSV)
+#' @param input_path path for importfile
+#'
+#' @importFrom readr read_csv cols col_double
+#' @importFrom haven read_dta read_sas read_spss
+#'
+#' @return dataframe with source data
+#'
+#' @noRd
+du.read.source.file <- function(input_path, input_format) {
+  du_data <- NULL
+
+  if (input_format %in% du.enum.input.format()) {
+    if (input_format == du.enum.input.format()$STATA) {
+      data <- read_dta(input_path)
+    } else if (input_format == du.enum.input.format()$SPSS) {
+      data <- read_spss(input_path)
+    } else if (input_format == du.enum.input.format()$SAS) {
+      data <- read_sas(input_path)
+    } else if (input_format == du.enum.input.format()$R) {
+      data <- source(input_path)
+    } else {
+      data <- read_csv(input_path, col_types = cols(.default = col_double()))
+    }
+  } else {
+    stop(paste0(
+      input_format, " is not a valid input format, Possible input formats are: ",
+      paste(du.enum.input.format(), collapse = ", ")
+    ))
+  }
+
+  return(data)
+}
+
+#' Get the table without rows containing only NA's.
+#'
+#' We have to remove the first column (child_id), that is generated always.
+#'
+#' @param dataframe dataframe to check
+#'
+#' @importFrom dplyr %>%
+#'
+#' @return dataframe without the na values
+#'
+#' @noRd
+du.data.frame.remove.all.na.rows <- function(dataframe) {
+  df <- dataframe[-c(1)]
+
+  naLines <- df %>%
+    is.na() %>%
+    apply(MARGIN = 1, FUN = all)
+
+  return(df[!naLines, ])
+}
+#'
+#' Matched the columns in the source data.
+#' You can then match the found column against the dictionary.
+#'
+#' @param data_columns columns obtained from raw data
+#' @param dict_columns columns matched in the dictionary
+#'
+#' @importFrom stringr str_subset
+#'
+#' @return matched_columns in source data
+#'
+#' @noRd
+du.match.columns <- function(data_columns, dict_columns) {
+  matched_columns <- character()
+
+  matched_columns <- data_columns[data_columns %in% dict_columns]
+
+  for (variable in dict_columns) {
+    matched_columns <- c(matched_columns, data_columns %>% str_subset(pattern = paste0("^",
+      variable, "\\d+",
+      sep = ""
+    )))
+  }
+  # Select the non-repeated measures from the full data set
+  return(matched_columns)
+}
+
+#'
+#' Check if there are columns not matching the dictionary.
+#'
+#' @param dict_kind specify which dictionary you want to check
+#' @param data_columns the coiumns within the data
+#' @param run_mode default = NORMAL, can be TEST and NON_INTERACTIIVE
+#'
+#' @return stops the program if someone terminates
+#'
+#' @noRd
+du.check.variables <- function(dict_kind, data_columns, run_mode) {
+  variables <- du.retrieve.dictionaries(dict_kind = dict_kind)
+
+  matched_columns <- du.match.columns(data_columns, variables$name)
+
+  columns_not_matched <- data_columns[!(data_columns %in% matched_columns)]
+
+  if (length(columns_not_matched) > 0) {
+    message(paste0("[WARNING] This is an unmatched column, it will be dropped : [ ", columns_not_matched, " ].", sep = '\n'))
+    if (run_mode != du.enum.run.mode()$NON_INTERACTIVE) {
+      proceed <- readline("Do you want to proceed (y/n)")
+    } else {
+      proceed <- "y"
+    }
+  } else {
+    proceed <- "y"
+  }
+  if (proceed == "n") {
+    message(paste0(columns_not_matched, sep = '\n'))
+    stop("Program is terminated. There are unmatched columns in your source data.")
+  }
+}
+
+#' Check for NA columns
+#'
+#' @param stripped variables without NA values
+#' @param raw original variables
+#' @param run_mode the run mode of the package
+#'
+#' @return stops the program if someone terminates
+#' 
+#' @noRd
+du.check.nas <- function(stripped, raw) {
+  
+  # remove child_id
+  raw <- raw[-1]
+  
+  variables_na <- raw[!(raw %in% stripped)]
+
+  if (length(variables_na) > 0) {
+    message(paste0("[WARNING] Variable dropped because completely missing: [ ", variables_na, " ]", sep = '\n'))
+    if (ds_upload.globals$run_mode != du.enum.run.mode()$NON_INTERACTIVE) {
+      proceed <- readline("Do you want to proceed (y/n)")
+    } else {
+      proceed <- "y"
+    }
+  } else {
+    proceed <- "y"
+  }
+  if (proceed == "n") {
+    message(paste0(variables_na, sep = '\n'))
+    stop("Program is terminated. There are columns in your source data that are completely missing.")
+  }
+}
+
+#' Generate the yearly repeated measures file and write it to your local workspace
+#'
+#' @param data data frame with all the data based upon the CSV file
+#' @param dict_kind can be 'core' or 'outcome'
+#'
+#' @importFrom readr write_csv
+#' @importFrom dplyr %>%
+#' @importFrom readxl read_xlsx
+#'
+#' @noRd
+du.reshape.wide.generate.non.repeated <- function(data, dict_kind) {
+  message("* Generating: non-repeated measures")
+
+  # Retrieve dictionary
+  variables_non_repeated_dict <- du.retrieve.dictionaries(du.enum.table.types()$NONREP, dict_kind)
+
+  # select the non-repeated measures from the full data set
+  non_repeated <- c("child_id", variables_non_repeated_dict$name)
+  non_repeated_measures <- data[, which(colnames(data) %in% non_repeated)]
+
+  # strip the rows with na values
+  stripped_non_repeated_measures <- non_repeated_measures[, colSums(is.na(non_repeated_measures)) <
+    nrow(non_repeated_measures)]
+  
+  du.check.nas(colnames(stripped_non_repeated_measures), colnames(non_repeated_measures))
+  
+  # add row_id again to preserve child_id
+  stripped_non_repeated_measures <- data.frame(
+    row_id = c(1:length(non_repeated_measures$child_id)),
+    non_repeated_measures
+  )
+
+  return(as.data.frame(stripped_non_repeated_measures))
+}
+
+#' Generate the yearly repeated measures file and write it to your local workspace
+#'
+#' @param data data frame with all the data based upon the CSV file
+#' @param dict_kind can be 'core' or 'outcome'
+#'
+#' @importFrom readr write_csv
+#' @importFrom dplyr %>% filter summarise bind_rows
+#' @importFrom maditr dcast as.data.table %<>%
+#' @importFrom tidyr gather
+#'
+#' @noRd
+du.reshape.wide.generate.yearly.repeated <- function(data, dict_kind) {
+  # workaround to avoid glpobal variable warnings, check:
+  # https://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when
+  orig_var <- value <- age_years <- . <- NULL
+
+  message("* Generating: yearly-repeated measures")
+
+  variables_yearly_repeated_dict <- du.retrieve.dictionaries(du.enum.table.types()$YEARLY, dict_kind)
+  matched_columns <- du.match.columns(colnames(data), variables_yearly_repeated_dict$name)
+  yearly_repeated_measures <- data[matched_columns]
+
+  if (nrow(du.data.frame.remove.all.na.rows(yearly_repeated_measures)) <= 0) {
+    message("[WARNING] No yearly-repeated measures found in this set")
+    return()
+  }
+
+  long_1 <- yearly_repeated_measures %>% gather(orig_var, value, matched_columns[matched_columns !=
+    "child_id"], na.rm = TRUE)
+
+  # Create the age_years variable with the regular expression extraction of the year
+  long_1$age_years <- as.numeric(du.num.extract(long_1$orig_var))
+
+  # Here we remove the year indicator from the original variable name
+  long_1$variable_trunc <- gsub("[[:digit:]]+$", "", long_1$orig_var)
+
+  raw <- unique(gsub("[[:digit:]]+$", "", colnames(yearly_repeated_measures)))
+  du.check.nas(unique(long_1$variable_trunc), raw)
+  
+  # Use the maditr package for spreading the data again, as tidyverse runs into memory
+  # issues
+  long_2 <- dcast(long_1, child_id + age_years ~ variable_trunc, value.var = "value")
+
+  # As the data table is still too big for opal, remove those rows, that have only
+  # missing values, but keep all rows at age_years=0, so no child_id get's lost:
+
+  # Subset of data with age_years = 0
+  zero_year <- long_2 %>% filter(age_years %in% 0)
+
+  for (id in unique(yearly_repeated_measures$child_id)) {
+    if (!(id %in% zero_year$child_id)) {
+      zero_year %<>% summarise(child_id = id, age_years = 0) %>% bind_rows(
+        zero_year,
+      )
+    }
+  }
+
+  # Subset of data with age_years > 0
+  later_year <- long_2 %>% filter(age_years > 0)
+
+  # Bind the 0 year and older data sets together
+  long_2 <- rbind(zero_year, later_year)
+
+  # Create a row_id so there is a unique identifier for the rows
+  long_2$row_id <- c(1:length(long_2$child_id))
+
+  # Arrange the variable names based on the original order
+  long_yearly <- long_2[, c("row_id", "child_id", "age_years", unique(long_1$variable_trunc))]
+
+  return(as.data.frame(long_yearly))
+}
+
+#' Generate the monthly repeated measures file and write it to your local workspace
+#'
+#' @param data data frame with all the data based upon the CSV file
+#' @param dict_kind can be 'core' or 'outcome'
+#'
+#' @importFrom readr write_csv
+#' @importFrom dplyr %>% filter summarise bind_rows
+#' @importFrom maditr dcast as.data.table %<>%
+#' @importFrom tidyr gather
+#'
+#' @noRd
+du.reshape.wide.generate.monthly.repeated <- function(data, dict_kind) {
+  # workaround to avoid glpobal variable warnings, check:
+  # https://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when
+  orig_var <- value <- age_months <- . <- NULL
+
+  message("* Generating: monthly-repeated measures")
+
+  variables_monthly_repeated_dict <- du.retrieve.dictionaries(du.enum.table.types()$MONTHLY, dict_kind)
+  matched_columns <- du.match.columns(colnames(data), variables_monthly_repeated_dict$name)
+  monthly_repeated_measures <- data[, matched_columns]
+
+  if (nrow(du.data.frame.remove.all.na.rows(monthly_repeated_measures)) <= 0) {
+    message("[WARNING] No monthly-repeated measures found in this set")
+    return()
+  }
+
+  long_1 <- monthly_repeated_measures %>% gather(orig_var, value, matched_columns[matched_columns !=
+    "child_id"], na.rm = TRUE)
+
+  # Create the age_years and age_months variables with the regular expression
+  # extraction of the year
+  long_1$age_years <- as.integer(as.numeric(du.num.extract(long_1$orig_var)) / 12)
+  long_1$age_months <- as.numeric(du.num.extract(long_1$orig_var))
+
+  # Here we remove the year indicator from the original variable name
+  long_1$variable_trunc <- gsub("[[:digit:]]+$", "", long_1$orig_var)
+
+  raw <- unique(gsub("[[:digit:]]+$", "", colnames(monthly_repeated_measures)))
+  du.check.nas(unique(long_1$variable_trunc), raw)
+  
+  # Use the maditr package for spreading the data again, as tidyverse ruins into memory
+  # issues
+  long_2 <- dcast(long_1, child_id + age_years + age_months ~ variable_trunc, value.var = "value")
+
+  # As the data table is still too big for opal, remove those rows, that have only
+  # missing values, but keep all rows at age_years=0, so no child_id get's lost:
+
+  # Subset of data with age_months = 0
+  zero_monthly <- long_2 %>% filter(age_months %in% 0)
+
+  for (id in unique(monthly_repeated_measures$child_id)) {
+    if (!(id %in% zero_monthly$child_id)) {
+      zero_monthly %<>% summarise(child_id = id, age_months = 0) %>% bind_rows(
+        zero_monthly,
+      )
+    }
+  }
+
+  # Subset of data with age_months > 0
+  later_monthly <- long_2 %>% filter(age_months > 0)
+
+  # Bind the 0 year and older data sets together
+  long_2 <- rbind(zero_monthly, later_monthly)
+
+  # Create a row_id so there is a unique identifier for the rows
+  long_2$row_id <- c(1:length(long_2$child_id))
+
+  # Arrange the variable names based on the original order
+  long_monthly <- long_2[, c("row_id", "child_id", "age_years", "age_months", unique(long_1$variable_trunc))]
+
+  return(as.data.frame(long_monthly))
+}
+
+#' Generate the weekly repeated measures file and write it to your local workspace
+#'
+#' @param data data frame with all the data based upon the CSV file
+#' @param dict_kind can be 'core' or 'outcome'
+#'
+#' @importFrom readr write_csv
+#' @importFrom dplyr %>% filter summarise bind_rows
+#' @importFrom maditr dcast as.data.table %<>%
+#' @importFrom tidyr gather
+#'
+#' @noRd
+du.reshape.wide.generate.weekly.repeated <- function(data, dict_kind) {
+  # workaround to avoid glpobal variable warnings, check:
+  # https://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when
+  orig_var <- value <- age_weeks <- . <- NULL # Gestational age in weeks
+
+  message("* Generating: weekly-repeated measures")
+
+  variables_weekly_repeated_dict <- du.retrieve.dictionaries(du.enum.table.types()$WEEKLY, dict_kind)
+  matched_columns <- du.match.columns(colnames(data), variables_weekly_repeated_dict$name)
+  weekly_repeated_measures <- data[, matched_columns]
+
+  if (nrow(du.data.frame.remove.all.na.rows(weekly_repeated_measures)) <= 0) {
+    message("[WARNING] No weekly-repeated measures found in this set")
+    return()
+  }
+
+  long_1 <- weekly_repeated_measures %>% gather(orig_var, value, matched_columns[matched_columns !=
+    "child_id"], na.rm = TRUE)
+
+  # Create the age_years and age_months variables with the regular expression
+  # extraction of the year NB - these weekly dta are pregnancy related so child is NOT
+  # BORN YET ---
+  long_1$age_years <- as.integer(as.numeric(du.num.extract(long_1$orig_var)) / 52)
+  long_1$age_weeks <- as.integer(du.num.extract(long_1$orig_var))
+
+  # Here we remove the year indicator from the original variable name
+  long_1$variable_trunc <- gsub("[[:digit:]]+$", "", long_1$orig_var)
+  
+  raw <- unique(gsub("[[:digit:]]+$", "", colnames(weekly_repeated_measures)))
+  du.check.nas(unique(long_1$variable_trunc), raw)
+
+  # Use the maditr package for spreading the data again, as tidyverse ruins into memory
+  # issues
+  long_2 <- dcast(long_1, child_id + age_years + age_weeks ~ variable_trunc, value.var = "value")
+
+  # As the data table is still too big for opal, remove those rows, that have only
+  # missing values, but keep all rows at age_years=0, so no child_id get's lost:
+
+  # Subset of data with age_months = 0
+  zero_weekly <- long_2 %>% filter(age_weeks %in% 0)
+
+  for (id in unique(weekly_repeated_measures$child_id)) {
+    if (!(id %in% zero_weekly$child_id)) {
+      zero_weekly %<>% summarise(child_id = id, age_weeks = 0) %>% bind_rows(
+        zero_weekly,
+      )
+    }
+  }
+
+  # Subset of data with age_months > 0
+  later_weekly <- long_2 %>% filter(age_weeks > 0)
+
+  # Bind the 0 year and older data sets together
+  long_2 <- rbind(zero_weekly, later_weekly)
+
+  # Create a row_id so there is a unique identifier for the rows
+  long_2$row_id <- c(1:length(long_2$child_id))
+
+  # Arrange the variable names based on the original order
+  long_weekly <- long_2[, c("row_id", "child_id", "age_years", "age_weeks", unique(long_1$variable_trunc))]
+
+  return(as.data.frame(long_weekly))
+}
+
+
+#' Generate the trimesterly repeated measures file and write it to your local workspace
+#'
+#' @param data data frame with all the data based upon the CSV file
+#' @param dict_kind can be 'core' or 'outcome'
+#'
+#' @importFrom readr write_csv
+#' @importFrom dplyr %>% filter summarise bind_rows
+#' @importFrom maditr dcast as.data.table %<>%
+#' @importFrom tidyr gather
+#'
+#' @noRd
+du.reshape.wide.generate.trimesterly.repeated <- function(data, dict_kind) {
+  # workaround to avoid glpobal variable warnings, check:
+  # https://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when
+  orig_var <- value <- age_trimester <- . <- NULL
+
+  message("* Generating: trimesterly-repeated measures")
+
+  variables_trimesterly_repeated_dict <- du.retrieve.dictionaries(
+    du.enum.table.types()$TRIMESTER,
+    dict_kind
+  )
+  matched_columns <- du.match.columns(colnames(data), variables_trimesterly_repeated_dict$name)
+  trimesterly_repeated_measures <- data[, matched_columns]
+
+  if (nrow(du.data.frame.remove.all.na.rows(trimesterly_repeated_measures)) <= 0) {
+    message("[WARNING] No trimesterly-repeated measures found in this set")
+    return()
+  }
+
+  long_1 <- trimesterly_repeated_measures %>% gather(orig_var, value, matched_columns[matched_columns !=
+    "child_id"], na.rm = TRUE)
+
+  # Create the age_years and age_months variables with the regular expression
+  # extraction of the year
+  long_1$age_trimester <- as.numeric(du.num.extract(long_1$orig_var))
+
+  # Here we remove the year indicator from the original variable name
+  long_1$variable_trunc <- gsub("[[:digit:]]+$", "", long_1$orig_var)
+  
+  raw <- unique(gsub("[[:digit:]]+$", "", colnames(trimesterly_repeated_measures)))
+  du.check.nas(unique(long_1$variable_trunc), raw)
+
+  # Use the maditr package for spreading the data again, as tidyverse ruins into memory
+  # issues
+  long_2 <- dcast(long_1, child_id + age_trimester ~ variable_trunc, value.var = "value")
+
+  # As the data table is still too big for opal, remove those rows, that have only
+  # missing values, but keep all rows at age_years=0, so no child_id get's lost:
+
+  # Subset of data with age_months = 0
+  one_trimesterly <- long_2 %>% filter(age_trimester %in% 1)
+
+  for (id in unique(trimesterly_repeated_measures$child_id)) {
+    if (!(id %in% one_trimesterly$child_id)) {
+      one_trimesterly %<>% summarise(child_id = id, age_trimester = 1) %>% bind_rows(
+        one_trimesterly,
+        .
+      )
+    }
+  }
+
+  # Subset of data with age_months > 0
+  later_trimesterly <- long_2 %>% filter(age_trimester > 1)
+
+  long_2 <- rbind(one_trimesterly, later_trimesterly)
+
+  # Create a row_id so there is a unique identifier for the rows
+  long_2$row_id <- c(1:length(long_2$child_id))
+
+  # Arrange the variable names based on the original order
+  long_trimesterly <- long_2[, c("row_id", "child_id", "age_trimester", unique(long_1$variable_trunc))]
+
+  return(as.data.frame(long_trimesterly))
+}
diff --git a/R/upload.R b/R/upload.R
index 423c93b..f2e2a67 100644
--- a/R/upload.R
+++ b/R/upload.R
@@ -13,6 +13,7 @@ ds_upload.globals <- new.env()
 #' @param data_input_path path to the to-be-reshaped data
 #' @param action action to be performed, can be 'reshape', 'populate' or 'all'
 #' @param run_mode default = NORMAL, can be TEST and NON_INTERACTIIVE
+#' @param data_type default = wide, can be long as well
 #'
 #' @examples
 #' \dontrun{
@@ -28,7 +29,7 @@ ds_upload.globals <- new.env()
 #' @export
 du.upload <- function(dict_version = "2_1", data_version = "1_0", dict_kind = du.enum.dict.kind()$CORE,
                       cohort_id, database_name = "opal_data", data_input_format = du.enum.input.format()$CSV, data_input_path,
-                      action = du.enum.action()$ALL, upload = TRUE, run_mode = du.enum.run.mode()$NORMAL) {
+                      action = du.enum.action()$ALL, upload = TRUE, run_mode = du.enum.run.mode()$NORMAL, data_type = du.enum.data.type()$WIDE) {
   message("######################################################")
   message("  Start upload data into DataSHIELD backend")
   message("------------------------------------------------------")
@@ -100,8 +101,10 @@ du.upload <- function(dict_version = "2_1", data_version = "1_0", dict_kind = du
         if (missing(data_input_format)) {
           data_input_format <- du.enum.input.format()$CSV
         }
+        
+        
         du.reshape(
-          upload, project, data_version, data_input_format, dict_version,
+          upload, project, data_version, data_type, data_input_format, dict_version,
           dict_kind, data_input_path, run_mode
         )
       }

From 9ba07405dbec0c84699fefb97c6b7af2a83bb7cd Mon Sep 17 00:00:00 2001
From: Sido Haakma <sido@haakma.org>
Date: Thu, 24 Jun 2021 12:27:30 +0200
Subject: [PATCH 2/5] chore: add test cohort

---
 R/enums.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/enums.R b/R/enums.R
index 752441c..06a8b4c 100644
--- a/R/enums.R
+++ b/R/enums.R
@@ -6,7 +6,7 @@ du.enum.cohorts <- function() {
     DNBC = "dnbc", GECKO = "gecko", ALSPAC = "alspac", GENR = "genr", MOBA = "moba", SWS = "sws", BIB = "bib", CHOP = "chop", ELFE = "elfe",
     EDEN = "eden", NINFEA = "ninfea", HBCS = "hbcs", INMA = "inma", ISGLOBAL = "isglobal", NBFC66 = "nfbc66", NBFC86 = "nfbc86", RAINE = "raine", RHEA = "rhea",
     ABCD = "abcd", BISC = "bisc", ENVIRONAGE = "environage", KANC = "kanc", PELAGIE = "pelagie", SEPAGES = "sepages", TNG = "tng", HGS = "hgs", RECETOX = "recetox", 
-    GENXXI = "genxxi"
+    GENXXI = "genxxi", TEST = "test"
   )
 }
 

From 96d053e11e400973083dc695e3836ed817745621 Mon Sep 17 00:00:00 2001
From: Sido Haakma <sido@haakma.org>
Date: Tue, 20 Jul 2021 13:04:10 +0200
Subject: [PATCH 3/5] feat: added genesis

---
 DESCRIPTION | 2 +-
 R/enums.R   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 55f2c53..caeb069 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: dsUpload
 Title: Upload Functions for DataSHIELD Backends
-Version: 4.2.2
+Version: 4.3.0
 Authors@R: 
   c(person(given = "Sido",
       family = "Haakma",
diff --git a/R/enums.R b/R/enums.R
index b4e6adf..55e6bbd 100644
--- a/R/enums.R
+++ b/R/enums.R
@@ -6,7 +6,7 @@ du.enum.cohorts <- function() {
     DNBC = "dnbc", GECKO = "gecko", ALSPAC = "alspac", GENR = "genr", MOBA = "moba", SWS = "sws", BIB = "bib", CHOP = "chop", ELFE = "elfe",
     EDEN = "eden", NINFEA = "ninfea", HBCS = "hbcs", INMA = "inma", ISGLOBAL = "isglobal", NBFC66 = "nfbc66", NBFC86 = "nfbc86", RAINE = "raine", RHEA = "rhea",
     ABCD = "abcd", BISC = "bisc", ENVIRONAGE = "environage", KANC = "kanc", PELAGIE = "pelagie", SEPAGES = "sepages", TNG = "tng", HGS = "hgs", RECETOX = "recetox", 
-    GENXXI = "genxxi"
+    GENXXI = "genxxi", GENESIS="genesis"
   )
 }
 

From def7625592bb05f5989b83fb50ba5588c1d4f394 Mon Sep 17 00:00:00 2001
From: Sido Haakma <sido@haakma.org>
Date: Wed, 3 Nov 2021 12:17:50 +0100
Subject: [PATCH 4/5] chore: remove unused lazydata prop

---
 DESCRIPTION | 1 -
 1 file changed, 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 28a378e..be4ae68 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -72,5 +72,4 @@ Additional_repositories:
 VignetteBuilder: knitr
 License: GPL-3
 Encoding: UTF-8
-LazyData: true
 RoxygenNote: 7.1.2

From 57b5c1b92a9c0528d00126c9d036017ab666ec60 Mon Sep 17 00:00:00 2001
From: Sido Haakma <sido@haakma.org>
Date: Wed, 3 Nov 2021 12:22:40 +0100
Subject: [PATCH 5/5] docs: rerendered docs

---
 man/du.upload.Rd | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/man/du.upload.Rd b/man/du.upload.Rd
index dfa47b2..9cf9de9 100644
--- a/man/du.upload.Rd
+++ b/man/du.upload.Rd
@@ -14,7 +14,8 @@ du.upload(
   data_input_path,
   action = du.enum.action()$ALL,
   upload = TRUE,
-  run_mode = du.enum.run.mode()$NORMAL
+  run_mode = du.enum.run.mode()$NORMAL,
+  data_type = du.enum.data.type()$WIDE
 )
 }
 \arguments{
@@ -37,6 +38,8 @@ du.upload(
 \item{upload}{do you want to upload the data (true or false)}
 
 \item{run_mode}{default = NORMAL, can be TEST and NON_INTERACTIIVE}
+
+\item{data_type}{default = wide, can be long as well}
 }
 \description{
 Upload dictionaries and data into the DataSHIELD backend.