-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathdata_cleaning_functions.R
More file actions
174 lines (162 loc) · 9.88 KB
/
data_cleaning_functions.R
File metadata and controls
174 lines (162 loc) · 9.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# Find the peak total power seen at each site.
#
# Power is first summed over every row sharing the same timestamp and site
# (e.g. several circuits at one site), then the largest of those
# per-timestamp totals is taken for each site.
#
# Args:
#   performance_data: data frame with columns ts, site_id and power_kW.
# Returns:
#   A data.frame with one row per site and columns site_id, max_power_kW.
calc_max_kw_per_site <- function(performance_data){
  # Total power per site at each timestamp.
  site_power_by_ts <- performance_data %>%
    group_by(ts, site_id) %>%
    summarise(power_kW=sum(power_kW)) %>%
    as.data.frame()
  # Largest per-timestamp total for each site.
  peak_by_site <- site_power_by_ts %>%
    group_by(site_id) %>%
    summarise(max_power_kW=max(power_kW)) %>%
    as.data.frame()
  return(peak_by_site[,c("site_id", "max_power_kW")])
}
# Collapse site details to a single row per site_id.
#
# Args:
#   site_details: data frame with columns site_id, s_state,
#     pv_installation_year_month, ac, dc, ac_old, dc_old, ac_dc_ratio,
#     manufacturer, model and s_postcode.
# Returns:
#   A data.frame with one row per site_id. Descriptive fields take the first
#   value in the group, manufacturer/model are space-joined across the
#   grouped rows, and rows_grouped records how many rows were merged.
group_site_details_to_one_row_per_site <- function(site_details){
  processed_site_details <- site_details %>%
    group_by(site_id) %>%
    summarise(
      s_state=first(s_state),
      pv_installation_year_month=first(pv_installation_year_month),
      # History: this used to be sum_ac=sum(ac), sum_dc=sum(dc).
      # NS note: neither the comment below nor the switch away from sum() was
      # made by NS.
      # CHANGES MADE AS TOTAL SITE AC SEEMS TO BE THE TOTAL SITE VALUES AS THEY
      # ARE ALL THE SAME REGARDLESS OF INVERTER
      sum_ac=mean(ac),
      sum_dc=mean(dc),
      # NOTE(review): the *_old columns are still summed while the cleaned
      # columns above are averaged, so for sites with more than one row the
      # two differ even when cleaning changed nothing - confirm intended.
      sum_ac_old=sum(ac_old),
      sum_dc_old=sum(dc_old),
      ac_dc_ratio=mean(ac_dc_ratio),
      manufacturer=paste(manufacturer, collapse=' '),
      model=paste(model, collapse=' '),
      s_postcode=first(s_postcode),
      rows_grouped=length(ac)) %>%
    as.data.frame()
  return(processed_site_details)
}
# Clean the ac and dc capacity values in the site details.
#
# Steps: record the raw ac/dc values, convert ac values that look like watts,
# collapse to one row per site, attach each site's peak power, then scale dc
# and ac capacities that are implausibly small relative to that peak.
#
# Args:
#   performance_data: data frame with columns ts, site_id and power_kW.
#   site_details: data frame of site details including site_id, ac and dc.
# Returns:
#   One row per site with cleaned ac/dc, the values before cleaning
#   (ac_old/dc_old), max_power_kW, and change_ac/change_dc flags (1 when the
#   cleaned value differs from the old one, otherwise 0).
site_details_data_cleaning <- function(performance_data, site_details){
  # Record old ac and dc values.
  site_details <- mutate(site_details, ac_old=ac, dc_old=dc)
  site_details <- check_for_ac_value_in_watts(site_details)
  # Find peak power per site for more checks.
  site_details <- group_site_details_to_one_row_per_site(site_details)
  max_site_power <- calc_max_kw_per_site(performance_data)
  # dplyr joins take their key through `by`; the previous `on=` argument is
  # not a left_join() parameter, so the key was never actually specified.
  site_details <- left_join(site_details, max_site_power, by="site_id")
  site_details <- check_for_peak_power_greater_than_dc_capacity(site_details)
  # Record if dc value was scaled or not. This must happen before the ac check,
  # which reads change_dc.
  # NOTE(review): sum_dc is a mean while sum_dc_old is a sum, so sites with
  # several rows may be flagged even when nothing was scaled - confirm.
  site_details <- mutate(site_details, change_dc=ifelse(sum_dc!=sum_dc_old,1,0))
  site_details <- check_ac_capacity_less_than_peak_power(site_details)
  # Record if ac capacity value was changed.
  site_details <- mutate(site_details, change_ac=ifelse(sum_ac!=sum_ac_old,1,0))
  # Restore the plain column names expected downstream.
  site_details <- setnames(site_details, c("sum_ac", "sum_dc", "sum_ac_old", "sum_dc_old"),
                           c("ac", "dc", "ac_old", "dc_old"))
  return(site_details)
}
# Scale up dc capacities that are too small for the observed peak power.
#
# If the dc capacity (sum_dc, divided by 1000 to compare with max_power_kW) is
# less than 90% of the magnitude of the site's peak power, multiply it by the
# smallest integer that brings it up to at least that peak.
#
# Rows are left unchanged when no peak power is available (NA) or when the
# capacity is not positive, because no meaningful scale factor exists there.
# The peak magnitude is used for the scale factor as well as for the test, so
# a negative peak cannot produce a negative capacity.
#
# Args:
#   site_details: data frame with columns sum_dc and max_power_kW.
# Returns:
#   site_details with sum_dc scaled where needed.
check_for_peak_power_greater_than_dc_capacity <- function(site_details){
  site_details <- mutate(
    site_details,
    sum_dc=ifelse(
      !is.na(max_power_kW) & sum_dc > 0 &
        (sum_dc/1000)/abs(max_power_kW) < 0.9,
      ceiling(abs(max_power_kW)/(sum_dc/1000)) * sum_dc,
      sum_dc))
  return(site_details)
}
# Convert ac capacities that appear to have been entered in watts.
#
# Adds the helper column ac_dc_ratio (ac divided by dc, from the values as
# supplied) and divides ac by 1000 for rows where that ratio is above 0.1 and
# ac itself is above 150.
#
# Args:
#   site_details: data frame with columns ac and dc.
# Returns:
#   site_details with ac_dc_ratio added and ac rescaled where flagged.
check_for_ac_value_in_watts <- function(site_details){
  # mutate() evaluates its arguments in order, so the ratio is computed from
  # the original ac before ac is rescaled.
  site_details <- mutate(
    site_details,
    ac_dc_ratio=ac/dc,
    ac=ifelse(ac_dc_ratio > 0.1 & ac > 150, ac/1000, ac))
  return(site_details)
}
# Scale up ac capacities that are too small for the observed peak power.
#
# If sum_ac is less than 60% of the magnitude of the site's peak power it is
# multiplied by an integer factor: when dc was not scaled (change_dc == 0) the
# factor brings ac up to the dc capacity (sum_dc / 1000), otherwise it brings
# ac up to the peak power.
#
# Rows are left unchanged when no peak power is available (NA) or when the ac
# capacity is not positive, because no meaningful scale factor exists there.
# The peak magnitude is used for the scale factor as well as for the test, so
# a negative peak cannot produce a negative capacity.
#
# Args:
#   site_details: data frame with columns sum_ac, sum_dc, max_power_kW and
#     change_dc.
# Returns:
#   site_details with sum_ac scaled where needed.
check_ac_capacity_less_than_peak_power <- function(site_details){
  site_details <- mutate(
    site_details,
    sum_ac=ifelse(
      !is.na(max_power_kW) & sum_ac > 0 & sum_ac/abs(max_power_kW) < 0.6,
      ifelse(change_dc==0,
             ceiling((sum_dc/1000)/sum_ac) * sum_ac,
             ceiling(abs(max_power_kW)/sum_ac) * sum_ac),
      sum_ac))
  return(site_details)
}
# Check and correct the connection type and polarity of each circuit.
#
# Summarises the time series to one row per c_id, then applies the
# day-vs-night energy, reversed-polarity and mixed-polarity checks, flagging
# any circuit whose connection type or polarity was changed.
#
# Args:
#   combined_data: time series data frame including ts, c_id, s_postcode,
#     con_type and polarity, plus the columns read by
#     clac_output_summary_values.
#   circuit_details: data frame with at least site_id and c_id.
#   postcode_data: postcode data frame passed to calc_sunrise_sunset_bounds
#     and joined on its postcode column.
# Returns:
#   One row per c_id with the cleaned con_type and polarity, their original
#   values (old_con_type/old_polarity), change flags, summary values and
#   site_id.
clean_connection_types <- function(combined_data, circuit_details, postcode_data){
  # The date of the first timestamp is used as the event date.
  # NOTE(review): as.Date() on a date-time may convert via UTC, which can
  # shift the date for local-time stamps - confirm the class/timezone of ts.
  postcode_data <- calc_sunrise_sunset_bounds(postcode_data, as.Date(combined_data$ts[1]))
  # Merge sunrise and sunset times onto the timeseries data frame.
  combined_data <- left_join(combined_data,
                             select(postcode_data, postcode, sunrise, sunset, dis_sunrise, dis_sunset),
                             by=c("s_postcode" = "postcode"))
  # Calculate the values needed to determine if a connection type is correct.
  combined_data <- clac_output_summary_values(combined_data)
  # Record details before changes.
  combined_data <- mutate(combined_data, old_con_type=con_type, old_polarity=polarity)
  combined_data <- check_day_vs_night_energy(combined_data)
  combined_data <- check_for_reversed_polarity(combined_data)
  combined_data <- check_for_mixed_polarity(combined_data)
  # Flag systems with changed connection type or polarity.
  combined_data <- mutate(combined_data,
                          con_type_changed=ifelse(con_type!=old_con_type,1,0),
                          polarity_changed=ifelse(polarity!=old_polarity,1,0))
  # Remove first_ac column. A logical mask is used because a negative which()
  # index selects no columns at all when the name is absent.
  combined_data <- combined_data[, !(names(combined_data) %in% c("first_ac")), drop=FALSE]
  # Select the values from the original circuit details that would not be
  # changed by cleaning, then merge back in with details updated or created by
  # cleaning. dplyr joins take their key through `by`, not `on`.
  circuit_details <- select(circuit_details, site_id, c_id)
  combined_data <- left_join(combined_data, circuit_details, by="c_id")
  return(combined_data)
}
# Add buffered sunrise and sunset times to the postcode data.
#
# Args:
#   postcode_data: data frame of postcodes; it is passed to getSunlightTimes,
#     so it presumably needs lat and lon columns - confirm against the
#     suncalc documentation.
#   event_date: date of the event, needed to calculate sunrise and sunset.
#   path_out: directory prefix (including trailing separator) for the exported
#     post_codes_data.csv. Defaults to the original hard-coded location; pass
#     NULL to skip the export.
# Returns:
#   postcode_data with added columns date, sunrise and sunset (1 hour before
#   sunrise / 1 hour after sunset) and dis_sunrise / dis_sunset (the same
#   times as character strings).
calc_sunrise_sunset_bounds <- function(postcode_data, event_date,
                                       path_out="F:/05_Solar_Analytics/2019-07-23_dtd_v_curtail_24days/"){
  # Need date of event to calculate sunrise and sunset times.
  postcode_data$date <- event_date
  # Find sunrise and sunset times on a postcode basis (one lookup for both).
  sun_times <- getSunlightTimes(data=postcode_data, tz="Australia/Brisbane",
                                keep=c('sunrise', 'sunset'))
  # Create 1 hour buffer either side of sunrise and sunset to allow for large
  # postcodes, as lat and lon is the postcode centre.
  postcode_data <- mutate(postcode_data,
                          sunrise=sun_times$sunrise - 60*60,
                          sunset=sun_times$sunset + 60*60)
  # Format sunrise and sunset as character so it is displayed in Brisbane time
  # in the gui.
  postcode_data <- mutate(postcode_data,
                          dis_sunrise=as.character(sunrise),
                          dis_sunset=as.character(sunset))
  # Export postcode data including sunrise and sunset times.
  if (!is.null(path_out)) {
    write.csv(postcode_data, paste0(path_out, 'post_codes_data.csv'), row.names=FALSE)
  }
  return(postcode_data)
}
# Summarise the time series to one row per circuit (c_id) with the values the
# connection type and polarity checks need.
#
# Args:
#   combined_data: time series data frame with columns ts, sunrise, sunset,
#     dis_sunrise, dis_sunset, c_id, e, con_type, first_ac, power_kW and
#     polarity.
# Returns:
#   A data.frame with one row per c_id: energy_day, energy_night, con_type,
#   sunrise, sunset (the display strings), first_ac, min_power, max_power,
#   polarity and frac_day. The result is returned visibly, like the other
#   helpers in this file.
clac_output_summary_values <- function(combined_data){
  # Determine if a data point is during daylight hours or not. Only the time
  # of day is compared: each value is reduced to H:M:S text and parsed back.
  combined_data <- mutate(
    combined_data,
    comp_t=as.POSIXct(strftime(ts, format="%H:%M:%S"),format="%H:%M:%S"),
    sunrise=as.POSIXct(strftime(sunrise, format="%H:%M:%S"),format="%H:%M:%S"),
    sunset=as.POSIXct(strftime(sunset, format="%H:%M:%S"),format="%H:%M:%S"),
    daylight=ifelse(comp_t>sunrise & comp_t<sunset,1,0))
  # Group data by c_id, calculating values needed to perform data cleaning.
  # NOTE(review): e is divided by 1000 and by 3600, so it is presumably in
  # watt-seconds and the energies in kWh - confirm.
  circuit_summary <- combined_data %>%
    group_by(c_id) %>%
    summarise(energy_day=round(sum(abs(e[daylight==1]))/1000/(60*60),digits=2),
              energy_night=round(sum(abs(e[daylight!=1]))/1000/(60*60), digits=2),
              con_type=first(con_type), sunrise=first(dis_sunrise),
              sunset=first(dis_sunset), first_ac=first(first_ac),
              min_power=min(power_kW), max_power=max(power_kW), polarity=first(polarity)) %>%
    as.data.frame()
  # Calculate the fraction of gen/load that occurred during daylight hours,
  # just use absolute value of power as polarity has not been checked yet.
  circuit_summary <- mutate(circuit_summary,
                            frac_day=round(energy_day/(energy_day+energy_night), digits=2))
  return(circuit_summary)
}
# Re-label pv connections that do not behave like pv as 'load'.
#
# A circuit whose con_type is one of the pv types is changed to "load" when
# less than 90% of its energy fell in daylight hours (frac_day < 0.90) and its
# total energy (day plus night) exceeds first_ac * 24 * 0.01, i.e. the system
# has operated at above an average of 1% capacity.
#
# Args:
#   combined_data: data frame with columns frac_day, con_type, energy_day,
#     energy_night and first_ac.
# Returns:
#   combined_data with con_type updated where the rule applies.
check_day_vs_night_energy <- function(combined_data){
  combined_data <- mutate(
    combined_data,
    con_type=ifelse(
      con_type %in% c("pv_site_net", "pv_site", "pv_inverter_net") &
        frac_day<0.90 &
        (energy_day+energy_night) > first_ac * 24 * 0.01,
      "load",
      con_type))
  return(combined_data)
}
# Swap the polarity of pv circuits whose peak power is negative.
#
# Polarity is multiplied by -1 when the largest power magnitude occurs as a
# negative value (abs(min_power) > abs(max_power) and min_power < 0), the
# connection type is one of the pv types, and the total energy (day plus
# night) exceeds first_ac * 24 * 0.005, i.e. the system has operated at above
# an average of 0.5% capacity (NS note: previously 1%).
#
# The energy test uses energy_day + energy_night, matching the activity test
# in check_day_vs_night_energy; it previously added energy_day to itself.
#
# Args:
#   combined_data: data frame with columns min_power, max_power, con_type,
#     energy_day, energy_night, first_ac and polarity.
# Returns:
#   combined_data with polarity flipped where the rule applies.
check_for_reversed_polarity <- function(combined_data){
  combined_data <- mutate(
    combined_data,
    polarity=ifelse(abs(min_power) > abs(max_power) &
                      con_type %in% c("pv_site_net", "pv_site", "pv_inverter_net") &
                      (energy_day+energy_night) > first_ac * 24 * 0.005 &
                      min_power < 0.0,
                    polarity * -1, polarity))
  return(combined_data)
}
# Mark circuits with significant power flow in both directions as 'mixed'.
#
# con_type becomes "mixed" when both the largest positive power (max_power)
# and the largest negative power (min_power, sign-flipped) are larger than
# 10% of first_ac.
#
# Args:
#   combined_data: data frame with columns max_power, min_power, first_ac and
#     con_type.
# Returns:
#   combined_data with con_type updated where the rule applies.
check_for_mixed_polarity <- function(combined_data){
  combined_data <- mutate(
    combined_data,
    con_type=ifelse(
      first_ac * .1 < max_power & first_ac * 0.1 < min_power * -1,
      "mixed",
      con_type))
  return(combined_data)
}