-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathplot_expression_matrix.py
More file actions
executable file
·159 lines (123 loc) · 5.14 KB
/
plot_expression_matrix.py
File metadata and controls
executable file
·159 lines (123 loc) · 5.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/usr/bin/env python3
# Set matplotlib backend before importing
import matplotlib
matplotlib.use('Agg') # Use non-interactive backend
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
def load_and_clean_data(matrix_file):
"""Load and clean the expression matrix data."""
print(f"Loading expression matrix from {matrix_file}")
# Read the file with explicit delimiter and handle whitespace
df = pd.read_csv(matrix_file, sep='\s+')
print(f"Initial data shape: {df.shape}")
# Set the first column as index
df = df.set_index('Gene_ID')
print(f"Data shape after setting index: {df.shape}")
# Convert all values to numeric
df = df.apply(pd.to_numeric)
print(f"Data shape after conversion: {df.shape}")
# Basic statistics
print("\nBasic statistics:")
print(df.describe())
return df
def plot_sample_correlations(df, output_dir):
"""Generate correlation scatter plots between replicates."""
print("Generating sample correlation plots...")
# Calculate correlation matrix
corr = df.corr()
# Create correlation heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', center=0, fmt='.3f')
plt.title('Sample Correlation Heatmap')
plt.tight_layout()
plt.savefig(f"{output_dir}/sample_correlation_heatmap.png", dpi=300)
plt.close()
# Create pairwise scatter plots for replicates
tissues = ['liver', 'muscle', 'brain', 'kidney']
for tissue in tissues:
rep1 = f"{tissue}_rep1"
rep2 = f"{tissue}_rep2"
plt.figure(figsize=(8, 8))
plt.scatter(np.log2(df[rep1] + 1), np.log2(df[rep2] + 1), alpha=0.5, s=1)
plt.xlabel(f"{rep1} (log2(counts + 1))")
plt.ylabel(f"{rep2} (log2(counts + 1))")
plt.title(f"{tissue.capitalize()} Replicate Comparison")
# Add correlation coefficient
corr_val = df[rep1].corr(df[rep2])
plt.text(0.05, 0.95, f"r = {corr_val:.3f}",
transform=plt.gca().transAxes)
# Add density to show where most points are
plt.hist2d(np.log2(df[rep1] + 1), np.log2(df[rep2] + 1),
bins=50, cmap='viridis', alpha=0.3)
plt.tight_layout()
plt.savefig(f"{output_dir}/{tissue}_replicate_scatter.png", dpi=300)
plt.close()
def plot_ma(df, output_dir):
"""Generate MA plots for each tissue comparison."""
print("Generating MA plots...")
tissues = ['liver', 'muscle', 'brain', 'kidney']
for tissue in tissues:
rep1 = f"{tissue}_rep1"
rep2 = f"{tissue}_rep2"
# Calculate M (log ratio) and A (mean average)
M = np.log2((df[rep1] + 1) / (df[rep2] + 1))
A = 0.5 * np.log2((df[rep1] + 1) * (df[rep2] + 1))
plt.figure(figsize=(8, 8))
# Add density to show where most points are
plt.hist2d(A, M, bins=50, cmap='viridis', alpha=0.3)
# Add scatter plot with smaller points
plt.scatter(A, M, alpha=0.5, s=1, c='black')
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('A (average log expression)')
plt.ylabel('M (log ratio)')
plt.title(f"{tissue.capitalize()} MA Plot")
plt.tight_layout()
plt.savefig(f"{output_dir}/{tissue}_MA_plot.png", dpi=300)
plt.close()
def plot_pca(df, output_dir):
"""Generate PCA plot of all samples."""
print("Generating PCA plot...")
# Prepare data for PCA
X = np.log2(df + 1) # Log transform
X = StandardScaler().fit_transform(X.T) # Scale the data
# Perform PCA
pca = PCA(n_components=2)
pca_result = pca.fit_transform(X)
# Calculate variance explained
var_explained = pca.explained_variance_ratio_ * 100
# Create DataFrame for plotting
pca_df = pd.DataFrame(data=pca_result, columns=['PC1', 'PC2'])
pca_df['sample'] = df.columns
pca_df['tissue'] = [x.split('_')[0] for x in df.columns]
# Plot
plt.figure(figsize=(10, 8))
sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='tissue', style='tissue', s=100)
plt.title('PCA Plot of Samples')
plt.xlabel(f'PC1 ({var_explained[0]:.1f}% variance explained)')
plt.ylabel(f'PC2 ({var_explained[1]:.1f}% variance explained)')
# Add sample labels
for idx, row in pca_df.iterrows():
plt.annotate(row['sample'], (row['PC1'], row['PC2']))
plt.tight_layout()
plt.savefig(f"{output_dir}/pca_plot.png", dpi=300)
plt.close()
def main():
# Define input/output paths
matrix_file = "chicken_atlas/expression_matrix_clean.txt"
output_dir = "chicken_atlas/plots"
# Create output directory if it doesn't exist
import os
os.makedirs(output_dir, exist_ok=True)
# Load and clean data
df = load_and_clean_data(matrix_file)
# Generate plots
plot_sample_correlations(df, output_dir)
plot_ma(df, output_dir)
plot_pca(df, output_dir)
print(f"Plots have been saved to {output_dir}")
if __name__ == "__main__":
main()