# invitroMuscleDystrophyAnchoredCellSheet/code3_upset_plot.py (main branch) - Evolved-Bio/invitroMuscleDystrophyAnchoredCellSheet on GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
!pip install upsetplot

import pandas as pd
import numpy as np
from google.colab import files
import re
from sklearn.preprocessing import QuantileTransformer
from upsetplot import UpSet, from_contents
import matplotlib.pyplot as plt

# Step 1: Ask the user for a CSV through the Colab upload widget
uploaded = files.upload()

# Step 2: Load the first uploaded file; its first column becomes the row index
filename = list(uploaded)[0]
df = pd.read_csv(filename, index_col=0)

# Step 3: Variance-stabilising log10 transform (the +1 offset keeps zeros finite)
df = np.log10(df + 1)

# Step 4: Quantile-transform each column onto a normal distribution so that
# the columns become comparable; random_state is fixed for reproducibility
scaler = QuantileTransformer(output_distribution='normal', random_state=0)
df_normalized = pd.DataFrame(
    scaler.fit_transform(df),
    index=df.index,
    columns=df.columns,
)

# Step 5: Average replicate columns that belong to the same condition.
# A column's condition prefix is everything before the first "-<digits>"
# occurrence in its name; columns sharing a prefix are treated as replicates.
column_groups = {}
for col in df_normalized.columns:
    prefix = re.split(r'-\d+', col, maxsplit=1)[0]
    column_groups.setdefault(prefix, []).append(col)

# One output column per condition: the row-wise mean over its replicates
mean_df = pd.DataFrame()
for prefix in column_groups:
    columns = column_groups[prefix]
    mean_df[prefix] = df_normalized[columns].mean(axis=1)

# Step 6: Pick the control condition and summarise its distribution.
# control_mean / control_std are scalars taken over all rows of the control
# column, so every row is scored against the same pooled reference.
# NOTE(review): this is not a per-protein z-score - confirm that is intended.
control_condition = '3D,HC'
control_mean = mean_df[control_condition].mean()
control_std = mean_df[control_condition].std()

# Step 7: For every non-control condition, collect the row labels whose
# z-score lies beyond the threshold in either direction (input for UpSet plots)
significant_threshold = 1  # Z-score threshold for significance
upregulated_data = {}
downregulated_data = {}

for column in mean_df.columns:
    if column == control_condition:
        continue  # the control is the reference, not a comparison
    z_scores = (mean_df[column] - control_mean) / control_std
    is_up = z_scores > significant_threshold
    is_down = z_scores < -significant_threshold
    upregulated_data[column] = set(mean_df.index[is_up])
    downregulated_data[column] = set(mean_df.index[is_down])

# Step 8: Plot UpSet plots for proteins with increased and decreased expressions compared to control
def plot_upset(data, title, bar_color, filename):
    """Draw, save and display an UpSet plot of the overlaps between conditions.

    Args:
        data: Mapping of condition name to the collection of row labels
            assigned to that condition.
        title: Title placed on the figure; its lower-cased form is also used
            in the message printed when there is nothing to plot.
        bar_color: Face colour applied to every patch of the intersections axes.
        filename: Destination path; the figure is written in SVG format.

    Font sizes come from the module-level ``title_fontsize``,
    ``label_fontsize`` and ``tick_fontsize`` variables, which therefore have
    to exist by the time this function is called.
    """
    # Guard: with every collection empty there is nothing to draw or save.
    if all(len(members) == 0 for members in data.values()):
        print(f"No significant {title.lower()} found.")
        return

    # Hand the categories to upsetplot in alphabetical order.
    ordered_contents = {name: data[name] for name in sorted(data)}
    membership = from_contents(ordered_contents)
    upset = UpSet(
        membership,
        show_counts='%d',
        element_size=50,
        sort_categories_by=None,
        sort_by='cardinality',
    )
    axes_by_name = upset.plot()

    # Recolour the vertical intersection-size bars.
    for patch in axes_by_name['intersections'].patches:
        patch.set_facecolor(bar_color)

    # pyplot calls below act on whichever axes is current after plot().
    plt.title(title, fontsize=title_fontsize)
    plt.xticks(fontsize=tick_fontsize)
    plt.yticks(fontsize=tick_fontsize)
    plt.xlabel('Conditions', fontsize=label_fontsize)
    plt.ylabel('Number of Proteins', fontsize=label_fontsize)

    # Save before show() so the written file contains the rendered figure.
    plt.savefig(filename, format='svg')
    plt.show()

# Font size settings (read as module-level globals inside plot_upset)
title_fontsize = 20
label_fontsize = 18
tick_fontsize = 16

# Plot and save the UpSet plots as SVG files
plot_upset(upregulated_data, 'General Increase in Expression Relative to Control', 'darkred', 'upregulated.svg')
plot_upset(downregulated_data, 'General Decrease in Expression Relative to Control', 'darkblue', 'downregulated.svg')

# Download the SVG files.
# plot_upset writes no file when a direction has no significant proteins, and
# files.download raises FileNotFoundError for a missing path. Each file is
# handled on its own so one missing plot cannot block the other download.
for svg_name in ('upregulated.svg', 'downregulated.svg'):
    try:
        files.download(svg_name)
    except FileNotFoundError:
        print(f"Skipping download of {svg_name}: the file was not created.")