# code5_violin_plot.py
# Source: Evolved-Bio/invitroMuscleDystrophyAnchoredCellSheet (GitHub, branch "main")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from google.colab import files
import os
from scipy.stats import ttest_ind, mannwhitneyu, shapiro, levene
from statsmodels.stats.multitest import multipletests
from matplotlib.ticker import FixedFormatter

def upload_files():
    """Announce the upload step and return the result of ``files.upload()``.

    ``files`` is the module imported from ``google.colab`` at the top of the
    file; ``process_files`` consumes the returned object as a mapping of
    filename to file content.
    """
    print("Uploading files...")
    uploaded = files.upload()
    return uploaded

def create_combined_violin_plot(dfs, names, figsize=(12, 5)):
    """Draw one violin per dataset from its 'Difference' column and save it as SVG.

    Args:
        dfs: DataFrames, each providing a 'Difference' column.
        names: Labels for the datasets, paired positionally with ``dfs``.
        figsize: Figure size passed to ``plt.figure``.

    Returns:
        A ``(combined_df, plot_path)`` tuple: the long-format DataFrame that
        was plotted (columns 'Comparison' and 'Log2 Fold Change') and the
        path of the saved SVG file.
    """
    plt.figure(figsize=figsize)

    values = []
    labels = []
    for frame, label in zip(dfs, names):
        values += list(frame['Difference'])
        # Put every word (and the "vs" separator) on its own line so long
        # labels stack vertically under each violin.
        stacked_label = label.replace(' vs ', '\nvs\n').replace(' ', '\n')
        labels += [stacked_label] * len(frame)

    combined_df = pd.DataFrame({
        'Comparison': labels,
        'Log2 Fold Change': values
    })

    sns.violinplot(
        x='Comparison',
        y='Log2 Fold Change',
        data=combined_df,
        palette='Set2',
        inner=None,
    )

    # Axis cosmetics; the y ticks are fixed at these five positions.
    plt.xticks(rotation=0, ha='center', fontsize=12)
    plt.yticks([-40, -20, 0, 20, 40], fontsize=12)
    plt.ylabel('Log2 Fold Change', fontsize=14)
    plt.xlabel('', fontsize=14)
    plt.title('Violin Plot', fontsize=16)
    plt.grid(True)
    plt.tight_layout()

    # Save before show() so the file is written from the current figure.
    output_path = 'Combined_Violin_Plot.svg'
    plt.savefig(output_path)
    plt.show()

    return combined_df, output_path

def perform_statistical_analysis(dfs, names):
    """Compare the 'Difference' column of every pair of datasets.

    For each pair the test is chosen from the assumption checks:

    * both samples pass Shapiro-Wilk (p > 0.05) and Levene (p > 0.05):
      Student's t-test;
    * both samples pass Shapiro-Wilk but Levene fails: Welch's t-test;
    * otherwise: two-sided Mann-Whitney U.

    The raw p-values are then adjusted with the Benjamini-Hochberg
    procedure (``method='fdr_bh'``).

    Args:
        dfs: DataFrames, each providing a 'Difference' column.
        names: Labels for the datasets, paired positionally with ``dfs``.

    Returns:
        DataFrame with one row per pair and the columns 'Comparison 1',
        'Comparison 2', 'Test Used', 't Statistic', 'U Statistic',
        'p-value' and 'Adjusted p-value'. The statistic column that does
        not apply to the chosen test holds the string 'N/A'. With fewer
        than two datasets the frame has these columns and no rows.
    """
    from itertools import combinations

    result_columns = [
        'Comparison 1',
        'Comparison 2',
        'Test Used',
        't Statistic',
        'U Statistic',
        'p-value',
    ]

    if len(dfs) < 2:
        # No pair exists, so there is nothing to test or to correct.
        # Return the usual schema instead of failing on a missing
        # 'p-value' column.
        empty = pd.DataFrame(columns=result_columns)
        empty['p-value'] = empty['p-value'].astype(float)
        empty['Adjusted p-value'] = pd.Series(dtype=float)
        return empty

    samples = [df['Difference'] for df in dfs]

    # Shapiro-Wilk only looks at one sample, so evaluate it once per
    # dataset rather than once per pair.
    is_normal = []
    for sample in samples:
        _, shapiro_p = shapiro(sample)
        is_normal.append(shapiro_p > 0.05)

    results = []
    for i, j in combinations(range(len(samples)), 2):
        sample_i = samples[i]
        sample_j = samples[j]

        # Variance homogeneity test
        _, p_value_levene = levene(sample_i, sample_j)
        equal_var = p_value_levene > 0.05

        t_stat = 'N/A'
        u_stat = 'N/A'
        if is_normal[i] and is_normal[j]:
            t_stat, p_value = ttest_ind(sample_i, sample_j, equal_var=equal_var)
            test_used = 't-test' if equal_var else 'Welch\'s t-test'
        else:
            # Request the two-sided alternative explicitly so the p-value
            # does not depend on the installed SciPy's default.
            u_stat, p_value = mannwhitneyu(sample_i, sample_j, alternative='two-sided')
            test_used = 'Mann-Whitney U'

        results.append({
            'Comparison 1': names[i],
            'Comparison 2': names[j],
            'Test Used': test_used,
            't Statistic': t_stat,
            'U Statistic': u_stat,
            'p-value': p_value
        })

    results_df = pd.DataFrame(results, columns=result_columns)

    # Multiple testing correction
    corrected_pvals = multipletests(results_df['p-value'], method='fdr_bh')[1]
    results_df['Adjusted p-value'] = corrected_pvals

    return results_df

def create_statistical_dot_plot(stats_df, figsize=(10, 8), dot_size=(20, 200), font_size=12):
    """Plot adjusted p-values of pairwise comparisons as a dot plot and save it as SVG.

    Each row of ``stats_df`` becomes one dot at ('Comparison 1',
    'Comparison 2'); dot size and colour both encode
    ``-log10('Adjusted p-value')``.

    Args:
        stats_df: DataFrame with 'Comparison 1', 'Comparison 2' and
            'Adjusted p-value' columns. It is not modified.
        figsize: Figure size passed to ``plt.figure``.
        dot_size: ``(min, max)`` marker sizes passed to seaborn as ``sizes``.
        font_size: Font size for tick and axis labels; the title uses
            ``font_size + 2``.

    Returns:
        Path of the saved SVG file.
    """
    plt.figure(figsize=figsize)

    # Work on a copy so the caller's frame keeps its original labels and
    # does not gain a helper column.
    plot_df = stats_df.copy()

    # Clamp at the smallest positive normal float so an adjusted p-value of
    # exactly 0 maps to a large finite value instead of infinity.
    adjusted = plot_df['Adjusted p-value'].astype(float)
    plot_df['-log10(p-value)'] = -np.log10(adjusted.clip(lower=np.finfo(float).tiny))

    # Modify labels to be displayed in multiple lines
    for column in ('Comparison 1', 'Comparison 2'):
        plot_df[column] = (
            plot_df[column]
            .str.replace(' vs ', '\nvs\n', regex=False)
            .str.replace(' ', '\n', regex=False)
        )

    # Create dot plot
    sns.scatterplot(
        data=plot_df,
        x='Comparison 1',
        y='Comparison 2',
        size='-log10(p-value)',
        hue='-log10(p-value)',
        palette='viridis',
        sizes=dot_size,
    )
    plt.title('Dot Plot of Adjusted p-values for Pairwise Comparisons', fontsize=font_size + 2)
    plt.xticks(rotation=0, ha='center', fontsize=font_size)
    # Style the existing y tick labels in place, mirroring the x axis,
    # rather than passing Text objects to a FixedFormatter, which expects
    # plain strings.
    plt.yticks(rotation=0, ha='right', fontsize=font_size, rotation_mode='anchor')
    plt.xlabel('Comparison 1', fontsize=font_size)
    plt.ylabel('Comparison 2', fontsize=font_size)
    plt.tight_layout()

    dot_plot_path = 'Statistical_Dot_Plot.svg'
    plt.savefig(dot_plot_path)
    plt.show()

    return dot_plot_path

def _load_uploaded_csv(filename, content):
    """Write *content* to *filename*, parse it as CSV and remove the file again.

    Raises:
        ValueError: If the parsed table has no 'Difference' column.
    """
    try:
        with open(filename, 'wb') as f:
            f.write(content)
        df = pd.read_csv(filename)
    finally:
        # Clean up even when writing or parsing fails, so a bad upload does
        # not leave a stray file behind.
        if os.path.exists(filename):
            os.remove(filename)

    if 'Difference' not in df.columns:
        raise ValueError(f"File {filename} does not contain 'Difference' column.")
    return df


def process_files(uploaded_files, figsize=(12, 5), dot_size=(20, 200), font_size=12):
    """Load the uploaded CSV files, run the statistics and produce both plots.

    Files that cannot be read or that lack a 'Difference' column are
    reported with a printed message and skipped. Each dataset is named
    after its filename without the final extension.

    With at least two valid files, the pairwise statistics are written to
    'statistical_analysis_results.csv' and the dot plot is created. With a
    single valid file only the violin plot is produced, because there is
    no pair to compare. Every produced file is passed to
    ``files.download``.

    Args:
        uploaded_files: Mapping of filename to raw file content (bytes).
        figsize: Figure size for the violin plot.
        dot_size: ``(min, max)`` marker sizes for the dot plot.
        font_size: Font size for the dot plot.

    Returns:
        None.
    """
    dfs = []
    names = []
    for filename, content in uploaded_files.items():
        try:
            df = _load_uploaded_csv(filename, content)
        except Exception as e:
            # Best effort: report the bad file and keep going with the rest.
            print(f"Error processing file {filename}: {e}")
            continue
        dfs.append(df)
        # splitext drops only the final extension, so dots inside the name
        # (e.g. "v1.2 vs WT.csv") are preserved.
        names.append(os.path.splitext(filename)[0])

    if not dfs:
        print("No valid files to process.")
        return

    stats_df = None
    if len(dfs) >= 2:
        stats_df = perform_statistical_analysis(dfs, names)
        stats_file_path = 'statistical_analysis_results.csv'
        stats_df.to_csv(stats_file_path, index=False)
        files.download(stats_file_path)
    else:
        print("Only one valid file; skipping pairwise statistics and dot plot.")

    _, plot_path = create_combined_violin_plot(dfs, names, figsize)

    dot_plot_path = None
    if stats_df is not None:
        dot_plot_path = create_statistical_dot_plot(
            stats_df, figsize=(10, 8), dot_size=dot_size, font_size=font_size
        )

    files.download(plot_path)
    if dot_plot_path is not None:
        files.download(dot_plot_path)

def main():
    """Ask for the input files, then run the analysis and plotting pipeline.

    The uploaded files are handed to ``process_files`` with a (12, 5)
    violin figure, dot sizes from 50 to 500 and font size 14.
    """
    uploaded_files = upload_files()
    process_files(uploaded_files, figsize=(12, 5), dot_size=(50, 500), font_size=14)


# Start the pipeline only when this file is executed directly, so importing
# it (for example to reuse the plotting helpers) has no side effects.
if __name__ == "__main__":
    main()