-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathAuxiliar_functions.R
More file actions
134 lines (109 loc) · 5.51 KB
/
Auxiliar_functions.R
File metadata and controls
134 lines (109 loc) · 5.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#Creation of function that finds objects, like Uniprot Identifiers,
#that have a desired correspondence (objs)
#from (a) certain column(s) of a data frame (cln), and returns a list
#of all the objects found (union)
fd_objects_frm_cln_union <- function(df, cln = c(search_, get_), objs = c(), vice_versa = TRUE){
#ARGUMENTS:
#df- dataframe/data; cln - colnumn names (str/char format) of df,
#objs - vector that include the itemns to find,
#vice_versa can be TRUE or FALSE, if TRUE (default), the function will
#search and get items from both columns indicated in cln
list_of_lines <- c()
list_of_lines_ <- c()
#NOTE: cln is a vetctor of paired columns, one to search information, and another to get information from
for (o in objs){
if (vice_versa == TRUE){
list_of_lines <- append(list_of_lines, which(df[,cln[1]]==o))
list_of_lines_ <- append(list_of_lines_, which(df[,cln[2]]==o))
}
else{
list_of_lines <- append(list_of_lines, which(df[,cln[1]]==o))
}
}
if (is.null(list_of_lines_) == FALSE){
vice <- unique(df[list_of_lines , cln[2]])
versa <- unique(df[list_of_lines_, cln[1]])
#organizes vice and versa versions of cln in separate parts of a list called results
results <- list("vice" = vice,"versa" = versa)
}
else {
results <- unique(df[list_of_lines , cln[2]])
}
return(results)
}
#Function specialized in finding gene.id's from a given sample in a data frame
#It is a simplified version of the previous function and more directed for a specific problem
index_gene_help <- function(df, sample=intersection){
#ARGUMENTS:
#df - data frame, sample - normally a vector of characters obtained by intersection of other data
linhas = c()
for (i in 1:length(df$Gene.ID)){
if (df$Gene.ID[i] %in% sample){
linhas <- append(linhas,i)
}
}
return(linhas)
}
#Program useful to run multiple correlation tests between a main data, in this case,
#RNAseq data for HeLa cell line, and various variables present in a data frame, corresponding here to
#human tissues from GTEx data, all acquired from Atlas Expression
Multiple_cor_tests <- function(test_table, base_table = hela_cr, xynames = c("Correlation coefficient", "Studied Tissues")){
#ARGUMENTS:
#test_table - table that contain multiple variables to test with base_table
#base_table - hela_cr by default, xynames - vector that includes the names of x and y plot's axis labels, in this order
#(default values for the purpose of this whole program)
names <- colnames(test_table)
df <- data.frame()
not_possible <- c()
for (i in 3:length(names)){
#in expression table, the first 2 columns concern gene id and gene name, that's why we start at 3
corrected = c()
for (l in 1:length(test_table[,i])){
#we establish TPM inferior to 1.0 as not significant expression data for this test
if (is.na(test_table[l,i])==FALSE && test_table[l,i]>1) {
#the following character vector includes every ENSEMBL identifier which TPM followed the previous conditions
corrected <- append(corrected,test_table[l,1])
}
}
if (length(corrected)<10){
#we establish that if the dimension of the data for the test is inferior to 10, the data from the selected
#tissue cannot be used by this function, in other words, it cannot calculate the correlation coefficient
not_possible <- append(not_possible, names(i))
}
else{
#acquisition of expression data for each test_table's column and the base's only column of interest
sample_base <- base_table[index_gene_help(base_table, sample = corrected),3]
sample_table_i <- test_table[index_gene_help(test_table, sample = corrected), i]
#correlation test itself
test <- cor.test(sample_base, sample_table_i, method="spearman", exact= FALSE)
test1 <- cor.test(sample_base, sample_table_i, method="pearson", exact= FALSE)
#normality test
test2 <- shapiro.test(sample_table_i)
#organization of information in the data frame, df, initially empty, in order to visualize it in the end
df[i-2, 1] <- names[i]
df[i-2, 2] <- test$estimate
df[i-2, 3] <- test$p.value
df[i-2, 4] <- test1$estimate
df[i-2, 5] <- test2$p.value
df[i-2, 6] <- length(corrected)
df[i-2, 7] <- test2$p.value
}
}
if (length(not_possible)!= 0){
print("The following tissues were not elegible for correlation test with the data for the cancerous cell line HeLa:", not_possible)
}
colnames(df) <- c("Tissues_to_compare","Spearman_Cor_coef","S_p_value", "Kendall_Cor_coef","K_p_value","Dimension", "SW Normality_test")
df <- df[order(df$Spearman_Cor_coef),]
x_name <- xynames[1]
y_name <- xynames[2]
#Function explained in ggplot 2 forums and main web page
plot <- ggplot(df, aes(x = Spearman_Cor_coef, y=reorder(Tissues_to_compare, + Spearman_Cor_coef), fill = S_p_value)) +
labs(x = x_name, y = y_name, fill = "p value")+
scale_fill_gradientn(trans = "log",breaks=c(0.1,0.05,0.01, 0.001,0.0001), colors = colorspace::diverge_hcl(7))+
geom_bar(stat="identity",color="black")+
guides(fill = guide_colourbar(barwidth = 0.5, barheight = 10))+
theme_classic()
#organization of the plot and data frame in a list, giving the possibility to see both as a return for this function
results <- list("plot"=plot, "dataframe" = df)
return(results)
}