Automation_scripts/mt_mapped_coverage.py at main · elichter/Automation_scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# This script gets the average coverage of the mitochondrial genome for samples
# mapped with CLC. The data originates from doing the following in CLC:
# Tools > Quality Control > QC for Read Mapping. Ensure that "Create
# separate table with statistics for each mapping" under "Output Options" is
# selected. This option generates the per-mapping reports, which are then
# exported as CSV and processed by this script as described below.

# To start, let's import the required libraries.
import os
import pandas as pd
import numpy as np

# Suffix of the per-mapping statistics reports to collect, and the columns
# of those reports that are kept for the coverage summary.
STATS_SUFFIX = 'statistics.csv'
COVERAGE_COLUMNS = ['Name', 'Reference name', 'Average coverage']


def find_stats_files(directory):
    """Return the names of the files in *directory* ending in 'statistics.csv'.

    The names are sorted so that the row order of the combined table does not
    depend on the arbitrary order in which the directory is listed.
    """
    return sorted(
        f for f in os.listdir(directory) if f.endswith(STATS_SUFFIX)
    )


def load_coverage(directory, filenames):
    """Read every CSV report and stack the reports into one dataframe.

    A 'Name' column is inserted as the first column of each report. It holds
    the part of the file's basename that precedes the first '_'. Reports that
    already contain a 'Name' column are skipped.

    Args:
        directory: Folder that contains the report files.
        filenames: Names of the report files inside *directory*.

    Returns:
        A single dataframe with the rows of all reports and a fresh index.

    Raises:
        ValueError: If no report is left to concatenate.
    """
    frames = []
    for filename in filenames:
        report = pd.read_csv(os.path.join(directory, filename))
        if 'Name' in report.columns:
            continue
        sample = os.path.basename(filename).split('_')[0]
        report.insert(0, 'Name', sample)
        frames.append(report)

    if not frames:
        # pd.concat would fail here with a generic "No objects to
        # concatenate"; say what was actually being looked for instead.
        raise ValueError(
            "No usable '*{}' reports found in {!r}".format(
                STATS_SUFFIX, directory
            )
        )
    return pd.concat(frames, axis=0, ignore_index=True)


def select_fraction(coverage, tag):
    """Return the rows of *coverage* whose 'Name' contains *tag*.

    '-<tag>' (for example '-Syn') is removed from the 'Name' column only; the
    other columns are left untouched. A copy is returned, so *coverage*
    itself is never modified.
    """
    mask = coverage['Name'].str.contains(tag, regex=False, na=False)
    subset = coverage[mask].copy()
    subset['Name'] = subset['Name'].str.replace('-' + tag, '', regex=False)
    return subset


def keep_processed(coverage, processed):
    """Restrict *coverage* to the sample names listed in *processed*.

    The tables are right-merged on 'Name', so every name in *processed*
    appears in the result; names without a coverage row get empty values.
    Only the 'Name' column of *processed* takes part in the merge, which keeps
    any other column it shares with *coverage* from being renamed with
    '_x'/'_y' suffixes. Duplicate rows are dropped.
    """
    merged = pd.merge(coverage, processed[['Name']], on='Name', how='right')
    return merged[COVERAGE_COLUMNS].drop_duplicates()


def main():
    """Build the synaptosome and homogenate coverage tables and export them.

    Reads the '*statistics.csv' reports from the current working directory,
    keeps the 'Name', 'Reference name' and 'Average coverage' columns, and
    separates synaptosome ('Syn') from homogenate ('WBH') samples. Since the
    samples are already mapped to the mitochondria, the 'Reference name'
    column is not filtered. Each table is then limited to the samples listed
    in 'df31.xlsx' (synaptosomes) and 'df41.xlsx' (homogenates), which are
    generated by the 'Concaten_clc_data.py' script, and written to
    'Syn_mt_mapped.xlsx' and 'WBH_mt_mapped.xlsx'.

    Returns:
        A tuple ``(syn, wbh)`` with the two exported dataframes.
    """
    path = os.getcwd()
    df = load_coverage(path, find_stats_files(path))
    df1 = df[COVERAGE_COLUMNS]

    syn = select_fraction(df1, 'Syn')
    wbh = select_fraction(df1, 'WBH')

    df31 = pd.read_excel('df31.xlsx')
    df41 = pd.read_excel('df41.xlsx')

    syn = keep_processed(syn, df31)
    wbh = keep_processed(wbh, df41)

    syn.to_excel('Syn_mt_mapped.xlsx', index=False)
    wbh.to_excel('WBH_mt_mapped.xlsx', index=False)
    return syn, wbh


if __name__ == '__main__':
    # Keep the result tables bound at module level so they can still be
    # inspected after running the script in an interactive session.
    Syn, WBH = main()