This repository was archived by the owner on Sep 26, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathks_normality.R
More file actions
118 lines (96 loc) · 3.43 KB
/
ks_normality.R
File metadata and controls
118 lines (96 loc) · 3.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/Rscript --slave
#####
# how to use
#####
usage <- "usage: Rscript ks_normality.R <file-with-input-data.csv>"
###
#
# TOOO: add p value in graph legend
###
### handle command line arguments (i.e. name of data file)
args <- commandArgs(TRUE)
inputfile <- args[1]
# open file, rad.csv expects fields separated by komma (",") and the dot (".") as decimal separator (US)
# open file, rad.csv2 expects fields separated by semikolon (";") and the komma (",") as decimal separator (European)
#incomingData <- read.csv2(inputfile, header=TRUE, check.names=FALSE)
incomingData <- read.csv2(inputfile, header=TRUE)
# give names of headers
headers <- names(incomingData)
# attach datafile so we can refer to variable names directly
# don't forget to use detach at the end
attach(incomingData)
# use plyr for colwise test
#install.packages("plyr")
library("plyr")
### TEST functions ###
### 1.) Kolmogorov-Smirnov-Test
### input: x = data, y: pnorm for normal distribution, plnorm for lognormal distribution
ks_test <- function(x,y="pnorm") {
mu <- mean(x)
sigma <- sd(x)
foo <- ks.test(x=x, y=y, mean = mu, sd = sigma)
sim <- foo$statistic
return(sim)
}
shapiro_test <- function(x) {
shapiro <- shapiro.test(x)
}
### END TEST functions ###
# Apply to every column in a data frame
normals <- numcolwise(shapiro_test)(incomingData)
# use psych for calculation of SD and MEAN
library("psych")
# use ggplot2 for printing plots
library("ggplot2")
# for loops through all columns
nvar <- length(headers)
### summary gives descriptive stats for each column
# summary(incomingData)
for(i in 1:(nvar)) {
nam <- headers[i]
### calculate mean and sd for normal curve
mean <- summary(incomingData[[nam]])[4]
sd <- SD(incomingData[[nam]])
### calculate min and max as well as number of valid data points for binwidth
min <- summary(incomingData[[nam]])[1]
max <- summary(incomingData[[nam]])[6]
num_nas <- summary(incomingData[[nam]])[7]
if(is.na(num_nas)) {
num_nas <- 0
}
valid_datapoints <- length(incomingData[[nam]]) - num_nas
binwidth <- (max - min) / valid_datapoints
cat("nam: ", nam, "max: ",max,"\n min: ",min,"\n valid samples: ",valid_datapoints,"\n binwidth: ", binwidth, "\n\n")
if(is.na(binwidth)) {
cat("COUGHT NA FOR VARIABLE ",nam,".. will SKIP processing this!!\n\n")
next
}
filename <- paste0("sig_graph/",nam,"_dichte_vs_normalverteilung.png")
ggplot(incomingData, aes_string(x=nam)) +
geom_histogram(aes(y = ..density..), alpha=0.2, binwidth=binwidth, fill="red",colour="black") +
stat_function(fun=dnorm, args=list(mean=mean, sd=sd), colour="green", label="Normalverteilung")
# stat_function(fun=dlnorm, args=list(mean=mean, sd=sd), colour="green", label="Lognormale Verteilung") +
theme(legend.position = "bottom", legend.direction = "horizontal")
ggsave(file=filename)
qqfilename <- paste0("sig_graph/",nam,"qq-plot.png")
png(qqfilename)
qqnorm(incomingData[[nam]],
main=paste0("Normalverteilung Q-Q Plot für ", nam),
xlab=paste0("theoretische Quantilen von ", nam),
ylab=paste0("gemessene Quantilen von ", nam))
qqline(incomingData[[nam]])
dev.off()
}
warnings()
# form output file names
csvnorm <- paste0('shapiro_normality_',inputfile)
# write to files
write.csv2(normals, file = csvnorm)
### output to LateX
# install.packages("xtable")
library("xtable")
latex <- xtable(normals)
# latex <- xtable(is_uniform)
print(latex,file=paste0(csvnorm,".tex"))
### clean up
detach(incomingData)