-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathRefseq_info_extract_script.sh
More file actions
44 lines (30 loc) · 1.52 KB
/
Refseq_info_extract_script.sh
File metadata and controls
44 lines (30 loc) · 1.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/bin/bash
#SBATCH --partition=DTN
#SBATCH --time=5:00:00
module load anaconda3
module load samtools
conda activate conda_environment
#Goes through the bam files and prints a count of the refseq alignments and accession ID to a "quality file"
for content in *.bam;
do
samtools view -q 30 $content | awk '{print $3}' | uniq -c | awk '{print $1", "$2}' |sort -n >> $content-quality.txt
done
#loop Goes through the sequence IDs in the second column of every Refseq quality file and loops through them into a command
#that extracts the genbank file of the organism and gather the organism and taxonomy info
#appends the organism name and taxa info of each organism in every line of a txt file
#Then the Refseq quality files of the two columns are joined together by the Refseq taxa info files into one file
for file in *quality.txt;
do
seqs=$(awk '{print $2}' $file)
for ID in $seqs;
do
organism=$(efetch -db nuccore -id $ID -format gb | grep 'ORGANISM')
taxa=$(efetch -db nuccore -id $ID -format gb | grep -A 2 'Eukaryota' | tr -d '\n' | sed 's/;/; /g')
echo $organism $file $taxa >> $file.taxon.txt
done
paste $file $file.taxon.txt >> $file-profile.txt
done
#after loop is done, a python script is called that edits the files into csv format
#The final command adds a header to the csv file of all content in the csv file
python3 Refseq_edit_file.py >> file_name.csv
sed -i '1i Read_counts,Sequence_ID,Microbe_organism,Animal_species,Domain,Kingdom,Subkingdom,Phylum,Subpyhlum,Class,Subclass,Order,Family,Genus,Species' file_name.csv