-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmotif_utils.py
More file actions
82 lines (70 loc) · 2.29 KB
/
motif_utils.py
File metadata and controls
82 lines (70 loc) · 2.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import logomaker
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import entropy
import numpy as np
def homer_reader(fname):
"""read homer files into a dataframes
fmame: .homore file
"""
with open(fname) as f:
flines = f.readlines()
all_matrix = []
all_fields = []
motif_no = 0
for line in flines:
if line[0] == ">":
if motif_no > 0:
all_matrix.append(
pd.DataFrame(matrix_lines, columns=["A", "C", "G", "U"])
)
all_fields.append(fields)
matrix_lines = []
fields = dict(
zip(
["log odds", "log pval"],
[float(l) for l in line.split("\t")[2:4]],
)
)
motif_no += 1
else:
# matrix
matrix_lines.append([float(l) for l in line.split("\t")])
return all_matrix, all_fields
def plot_motif(df, label="bits", ax=None, title="", information_content=True, **kwargs):
"""df: index:[ATCG], col: integer pos, PWM"""
rc = {
"axes.spines.left": False,
"axes.spines.right": False,
"axes.spines.bottom": False,
"axes.spines.top": False,
"xtick.bottom": False,
"xtick.labelbottom": False,
"ytick.labelleft": False,
"ytick.left": False,
}
plt.rcParams.update(rc)
information_content = 2 - df.apply(entropy, axis=0)
bits = df * information_content
# top 20
bits.columns = np.arange(bits.shape[1])
ww_logo = logomaker.Logo(bits.T, vpad=0.1, width=0.8, ax=ax, **kwargs)
ww_logo.ax.set_ylabel(label)
try:
ax.set_title(title)
except:
pass
def get_strongest_motif(filename, sort_by="log pval"):
motifs, attributes = homer_reader(filename)
if len(attributes) == 0:
return None, None
attr_df = pd.DataFrame(attributes)
# sort motif by l2 odds
if sort_by in attr_df.columns:
best_index = attr_df[sort_by].idxmax()
ma = motifs[best_index]
attr = attributes[best_index]
pv = attr["log pval"]
odds = attr["log odds"]
seq = "".join(ma.idxmax(axis=1).tolist())
return ma, [seq, pv, odds]