# geneious_parser.py
# Source: yoheLab/ORA-Pipeline-Scripts repository (main branch) on GitHub.
import os

def main():
	"""Run the whole pipeline for one data folder chosen by the user.

	Prompts for the name of a folder located in the current working
	directory, then:
	1. writes one header-only file per input file into "<folder>_parsed",
	2. tallies the gene types found in each header-only file,
	3. writes the summary tables,
	4. builds the "<folder>_geneious" directory tree and fills it with one
	   file per sequence.
	"""
	#per-file results, index-aligned with filenames
	types = []
	counts = []
	typesWCS = []
	countsWCS = []
	#keep asking until the name of a folder in the working directory is given
	while True:
		print("Enter folder name: ")
		foldername = input()
		try:
			#the name has to be a direct child of the working directory, and it
			#has to be a directory: a regular file with that name would only
			#fail later, in os.listdir(foldername)
			found = foldername in os.listdir() and os.path.isdir(foldername)
		except FileNotFoundError:
			found = False
		if found:
			break
		print("Folder not found.")
	#create the output folder for the header-only files if it is not there yet
	foldername_parsed = foldername + "_parsed"
	if foldername_parsed not in os.listdir():
		os.mkdir(foldername_parsed)
	#take a snapshot of the input file names before anything is written
	filenames = list(os.listdir(foldername))
	#headerParser returns the name of the header-only file it wrote; from here
	#on the list holds those names instead of the input file names
	filenames = [headerParser(name, foldername, foldername_parsed) for name in filenames]
	#tally the gene types of every header-only file
	for name in filenames:
		types_Temp, counts_Temp, typesWCS_Temp, countsWCS_Temp = geneCounter(name, foldername_parsed)
		types.append(types_Temp)
		counts.append(counts_Temp)
		typesWCS.append(typesWCS_Temp)
		countsWCS.append(countsWCS_Temp)
	#merge the per-file type lists into the column headers of the tables
	headers, headerWCS = listMerger(types, typesWCS)
	#write the summary tables
	csvWriter(types, counts, typesWCS, countsWCS, headers, headerWCS, filenames, foldername)
	#make geneious file structure
	geneiousFileMaker(types, foldername, filenames)
	#parse the sequences of one organism at a time and write them into its folders
	for filename in filenames:
		geneiousFilePopulator(geneiousSequenceParser(filename, foldername_parsed, foldername), filename, foldername)

def headerParser(filename, foldername, foldername_parsed):
	"""Copy the header lines of one FASTA file into a new text file.

	Every line of foldername/filename that starts with ">" is written,
	unchanged, to a file inside foldername_parsed. The name of that file is
	the part of filename before the first "genomic", with leading and
	trailing underscores removed, followed by ".txt".

	Both folder names are resolved against the current working directory.
	The working directory itself is never changed, so a failure while reading
	or writing cannot leave the process in the wrong folder.

	Returns the name (not the path) of the file that was written.
	"""
	#collect every header line, newline included
	with open(os.path.join(foldername, filename)) as fh:
		onlyHeaders = [line for line in fh if line.startswith(">")]
	#the species name is whatever precedes "genomic"; this works for
	#non-reference genomes (for reference genomes, whose names carry "GCF" or
	#"GCA" after the species name, the split was done on "GC" instead)
	parsedName = filename.split("genomic")[0].strip("_") + ".txt"
	#the context manager closes the file even if a write fails
	with open(os.path.join(foldername_parsed, parsedName), "w") as newFile:
		newFile.writelines(onlyHeaders)
	#the caller uses the new name to refer to the header-only file
	return parsedName

def geneCounter(filename, foldername_parsed):
	"""Tally the gene types listed in one header-only file.

	Each line of foldername_parsed/filename is split on "|":
	- two fields   -> counted as (second field, "CODING")
	- three fields -> counted as (second field, third field)
	- anything else is ignored.

	Returns a tuple of four lists:
	- the unique gene types, in order of first appearance
	- the number of lines for each of those gene types
	- the unique (gene type, coding status) tuples, in order of first appearance
	- the number of lines for each of those tuples

	The path is resolved against the current working directory, which is not
	changed.
	"""
	#dicts keep insertion order, so their keys come out in first-seen order
	typeTotals = {}
	pairTotals = {}
	with open(os.path.join(foldername_parsed, filename)) as fh:
		for line in fh:
			fields = line.split("|")
			if len(fields) == 2:
				#no coding status given: the gene counts as coding
				pair = (fields[1].strip("\n"), "CODING")
			elif len(fields) == 3:
				pair = (fields[1].strip("\n"), fields[2].strip("\n"))
			else:
				continue
			typeTotals[pair[0]] = typeTotals.get(pair[0], 0) + 1
			pairTotals[pair] = pairTotals.get(pair, 0) + 1
	#having both the plain and the with-coding-status tallies allows several
	#kinds of table to be made later
	return (list(typeTotals), list(typeTotals.values()), list(pairTotals), list(pairTotals.values()))

def listMerger(types, typesWCS):
	"""Flatten two lists of lists into two lists without duplicates.

	types and typesWCS each hold one inner list per file. Every inner item is
	kept the first time it is seen and skipped afterwards, so the results are
	in order of first appearance.

	Returns (merged types, merged typesWCS).
	"""
	def _uniqueInOrder(groups):
		#walk the groups in order and keep only the first copy of each item
		merged = []
		for group in groups:
			for item in group:
				if item not in merged:
					merged.append(item)
		return merged

	return (_uniqueInOrder(types), _uniqueInOrder(typesWCS))

def csvWriter(types, counts, typesWCS, countsWCS, headers, headerWCS, filenames, foldername):
	"""Write the three comma-separated summary tables for a data folder.

	Files written in the current working directory:
	- foldername + ".txt": one column per entry of headers
	- foldername + "_coding_status.txt": one column per entry of headerWCS
	  whose coding status is "CODING"
	- foldername + "_coding_status_psuedo.txt": one column per entry of
	  headerWCS whose coding status is "PSEUDOGENE" (the file name keeps its
	  historical spelling)

	Each table starts with a header row and has one row per entry of
	filenames: the file name without its ".txt" suffix, followed by the count
	of every column, or 0 when the file does not have that column. Fields are
	separated by ", " and no row ends with a separator.

	types[i] / counts[i] and typesWCS[i] / countsWCS[i] are the parallel
	lists belonging to filenames[i]; the entries of headerWCS and typesWCS[i]
	are (gene type, coding status) tuples.
	"""
	def _organism(name):
		#remove only a trailing ".txt"; str.strip(".txt") would also eat any
		#leading or trailing '.', 't' and 'x' of the organism name itself
		return name[:-4] if name.endswith(".txt") else name

	def _writeTable(path, columns, titles, keyLists, valueLists):
		#write one table: titles label the columns, columns are the lookup keys
		with open(path, "w") as table:
			table.write("Organism Name" + ", " + ", ".join(titles) + "\n")
			for i, name in enumerate(filenames):
				#first occurrence of a key wins, like list.index would pick
				lookup = {}
				for key, value in zip(keyLists[i], valueLists[i]):
					lookup.setdefault(key, value)
				cells = [str(lookup.get(column, 0)) for column in columns]
				table.write(_organism(name) + ", " + ", ".join(cells) + "\n")

	#table without coding status
	_writeTable(foldername + ".txt", headers, list(headers), types, counts)
	#one table per coding status; the columns are filtered first so that the
	#separators depend on the columns actually written to that table
	for status, suffix in (("CODING", "_coding_status.txt"), ("PSEUDOGENE", "_coding_status_psuedo.txt")):
		columns = [header for header in headerWCS if header[1] == status]
		#OR type and coding status separated by a hyphen
		titles = [header[0] + "-" + header[1] for header in columns]
		_writeTable(foldername + suffix, columns, titles, typesWCS, countsWCS)

def geneiousFileMaker(types, foldername, filenames):
	"""Create the "<foldername>_geneious" directory tree.

	Layout, relative to the current working directory:
	    <foldername>_geneious/<organism>/<OR>/<OR>_CODING
	    <foldername>_geneious/<organism>/<OR>/<OR>_PSEUDOGENE
	with one <organism> folder per entry of filenames and one <OR> folder per
	entry of types[i], the gene types of filenames[i].

	Folders that already exist are left alone and missing ones are added, so
	a tree left half-built by an interrupted run is completed. The working
	directory is not changed.
	"""
	#make head folder for geneious file structure
	headFolder = foldername + "_geneious"
	os.makedirs(headFolder, exist_ok=True)
	for i, filename in enumerate(filenames):
		#NOTE: str.strip(".txt") removes any leading or trailing '.', 't' and
		#'x', not just the suffix. It is kept on purpose: geneiousFilePopulator
		#derives the organism folder name in exactly the same way, and the two
		#have to agree.
		organismFolder = os.path.join(headFolder, filename.strip(".txt"))
		os.makedirs(organismFolder, exist_ok=True)
		#make a folder for each gene type, holding one folder per coding status
		for OR in types[i]:
			for status in ("CODING", "PSEUDOGENE"):
				os.makedirs(os.path.join(organismFolder, OR, OR + "_" + status), exist_ok=True)

def geneiousSequenceParser(filename, foldername_parsed, foldername):
	"""Read the FASTA records belonging to one organism.

	filename is the name of the organism's header-only file; without its
	".txt" suffix it is searched for, as a substring, in the names of the
	files inside foldername. Every matching file is read.

	A record starts at a line beginning with ">" and its sequence is the
	concatenation of the following lines, up to the next such line. The header
	is split on "|": with two fields the coding status is "CODING", with
	three fields it is the third field. Records whose header has any other
	number of fields are skipped together with their sequence, so a header is
	never paired with the sequence of a different record.

	foldername_parsed is accepted for call compatibility and is not used.
	The working directory is not changed.

	Returns a list of (OR type, coding status, header, sequence) tuples.
	"""
	#remove only the ".txt" suffix; str.strip(".txt") would also remove any
	#leading or trailing '.', 't' and 'x' of the name itself
	stem = filename[:-4] if filename.endswith(".txt") else filename
	masterList = []
	for entry in os.listdir(foldername):
		if stem not in entry:
			continue
		#current is (OR type, coding status, header) of the record being
		#read, or None while no usable record is open
		current = None
		chunks = []
		with open(os.path.join(foldername, entry)) as fh:
			for line in fh:
				if line.startswith(">"):
					#a new header closes the record read so far
					if current is not None:
						masterList.append(current + ("".join(chunks),))
					chunks = []
					fields = line.split("|")
					#determine if gene is coding or pseudo
					if len(fields) == 2:
						current = (fields[1].strip("\n"), "CODING", line.strip("\n"))
					elif len(fields) == 3:
						current = (fields[1].strip("\n"), fields[2].strip("\n"), line.strip("\n"))
					else:
						current = None
				elif current is not None:
					chunks.append(line.strip("\n"))
		#add in last sequence as there are no more > symbols
		if current is not None:
			masterList.append(current + ("".join(chunks),))
	#the records of a single organism go back to the caller
	return masterList

def geneiousFilePopulator(masterList, filename, foldername):
	"""Write every record of one organism into the geneious directory tree.

	masterList holds (OR type, coding status, header, sequence) tuples. Each
	record becomes a file named after its header, located at
	    <foldername>_geneious/<organism>/<OR type>/<OR type>_<coding status>/
	relative to the current working directory, and containing the header on
	the first line and the sequence on the second. The folders must already
	exist. The working directory is not changed.

	Prints "<organism> done." when all records have been written.
	"""
	#NOTE: str.strip(".txt") removes any leading or trailing '.', 't' and 'x',
	#not just the suffix. It is kept on purpose: geneiousFileMaker names the
	#organism folder in exactly the same way, and the two have to agree.
	organism = filename.strip(".txt")
	organismFolder = os.path.join(foldername + "_geneious", organism)
	#run through masterList and distribute files into proper folders
	for geneType, status, header, sequence in masterList:
		target = os.path.join(organismFolder, geneType, geneType + "_" + status, header)
		#the context manager closes each file as soon as it has been written
		with open(target, "w") as newFile:
			newFile.write(header + "\n")
			newFile.write(sequence + "\n")
	print(organism + " done.")

#start the pipeline only when this file is executed as a script, not on import
if __name__ == "__main__":
	main()