WeeklyUpdates/DataWrangling.R at main · Jcbetts/WeeklyUpdates · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
library(charlatan)
library(dplyr)
library(readr)
library(stringr)
library(tibble)

PrepData <- read.csv("Data/October/Week 2/10-13-25.csv")

# Change column names
PrepData <- rename(PrepData, ID = Household..,
               Name = Member,
               Email = X,
               Pass_Type = Pass.Code,
               Facility = Visit.Loc)

# change date format for POSIX
PrepData$Time <-format(strptime(PrepData$Time, "%I:%M %p"), format="%H:%M:%S")
PrepData$Date <- as.Date(PrepData$Date, format = "%m/%d/%Y")

# create date + time column to compare
PrepData <- PrepData |>
  mutate(DateTime = as.POSIXct(paste(Date, Time), format = "%Y-%m-%d %H:%M:%S"))

# for facility visits, two-minute cool-down not active... manually identifying and removing
PrepData <- PrepData |>
  arrange(ID, Name, DateTime) |>
  group_by(ID, Name) |>
  mutate(
    TimeSinceSwipe = as.numeric(difftime(DateTime, lag(DateTime), units = "mins"))) |>
  ungroup()

# filter for swipe minutes > 2 or NA (which is a member's first swipe)
PrepData <- PrepData |>
  filter(TimeSinceSwipe > 2.00 | is.na(TimeSinceSwipe))

# facility assignment

PrepData$Facility <- str_replace(PrepData$Facility, "^(Turnstile|SM|SRC)$", "1")

PrepData$Facility <- str_replace(PrepData$Facility, "^(NREC-QCK|Turnstile-NREC|NREC)$", "2")

PrepData$Facility <- str_replace(PrepData$Facility, "^Turnstile-zBD$", "3")

PrepData$Facility <- str_replace(PrepData$Facility, "^Studio 91$", "4")

PrepData$Facility[PrepData$Facility == ""] <- "5"

# helpful to have a cleaned and unscrambled file to compare to cleaned and scrambled file to ensure nothing is lost...
write.csv(PrepData, "Data/CompareData.csv")

# Complete working function with detailed diagnostics
scramble_large_dataset_working <- function(data, seed = NULL) {
  if (!is.null(seed)) set.seed(seed)

  cat("=== Starting Data Scrambling ===\n")
  cat("Original data:", nrow(data), "rows x", ncol(data), "columns\n")

  # Convert to character to avoid type issues
  PrepData$ID <- as.character(PrepData$ID)
  PrepData$Name <- as.character(PrepData$Name)
  PrepData$Email <- as.character(PrepData$Email)

  cat("Data types after conversion - ID:", class(PrepData$ID), "Name:", class(PrepData$Name), "Email:", class(PrepData$Email), "\n")

  # Remove NA values
  complete_cases <- complete.cases(PrepData[, c("ID", "Name", "Email")])
  df_clean <- PrepData[complete_cases, ]

  if (nrow(df_clean) != nrow(PrepData)) {
    cat("Removed", nrow(PrepData) - nrow(df_clean), "rows with missing values\n")
  }

  # Get unique combinations
  unique_people <- unique(df_clean[, c("ID", "Name", "Email")])
  n_unique <- nrow(unique_people)

  cat("Processing", n_unique, "unique individuals from", nrow(df_clean), "observations\n")

  # Add scrambled columns to unique_people dataframe
  unique_people$scrambled_id <- paste0("SUBJ_", sprintf("%06d", 1:n_unique))
  unique_people$scrambled_name <- paste("User", sample(1000:(1000+n_unique*2), n_unique))
  unique_people$scrambled_email <- paste0("user", sample(10000:99999, n_unique), "@example.com")

  # Use merge to join scrambled data back
  cat("Merging scrambled data back to original dataset...\n")
  df_merged <- merge(df_clean, unique_people, by = c("ID", "Name", "Email"), all.x = TRUE)

  cat("After merge:", nrow(df_merged), "rows\n")

  # Check for any failed matches
  failed_matches <- sum(is.na(df_merged$scrambled_id))
  if (failed_matches > 0) {
    cat("WARNING:", failed_matches, "rows failed to match!\n")
  } else {
    cat("SUCCESS: All rows matched successfully!\n")
  }

  # Remove original sensitive columns
  df_final <- df_merged[, !names(df_merged) %in% c("ID", "Name", "Email")]

  # Create clean mapping table
  mapping_table <- unique_people[, c("ID", "Name", "Email", "scrambled_id", "scrambled_name", "scrambled_email")]
  names(mapping_table)[1:3] <- c("original_id", "original_name", "original_email")

  cat("=== Scrambling Complete ===\n")
  cat("Final dataset:", nrow(df_final), "rows\n")
  cat("Unique scrambled IDs:", length(unique(df_final$scrambled_id)), "\n")
  cat("Mapping table:", nrow(mapping_table), "mappings\n")

  return(list(
    data = df_final,
    mapping = mapping_table
  ))
}

# Run the working version
result <- scramble_large_dataset_working(data, seed = 42)
df_scrambled <- result$data
mapping_table <- result$mapping

write.csv(mapping_table, "Data/MappingTable.csv")

# Rename columns back to original values

df_scrambled <- rename(df_scrambled, ID = scrambled_id,
               Name = scrambled_name,
               Email = scrambled_email)

write_rds(df_scrambled, "Data/October/Week 2/New.rds")