-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathestimate_DNA_stochastic.py
More file actions
52 lines (48 loc) · 1.81 KB
/
estimate_DNA_stochastic.py
File metadata and controls
52 lines (48 loc) · 1.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import numpy as np
import pickle
import sys
def condition_check(s):
if len(s) < 3:
return True
if s[-3:] == "ATG":
return False
if len(s) < 5:
return True
if s[-5:] in {"TTGCA", "CCCAT"}:
return False
if s[-1] == s[-2] == s[-3] == s[-4] == s[-5]:
return False
if len(s) < 6:
return True
if s[-6:] == "TATAAT":
return False
return True
if __name__ == "__main__":
try:
# We need a markov chain of order 6
current_str = ""
symbols = ["A", "C", "G", "T"]
states = {} #"sb0, sb1, sb2, sb3, sb4, sb5" -> "sb'"
iter = 0
# we simulate the constraint to construct the stochastic matrices
while True:
print(f"iter = {iter}", end = "\r", flush=True)
iter += 1
rand_sym = symbols[np.random.randint(0, 4)]
proposed_next_str = current_str + rand_sym
while not(condition_check(proposed_next_str)):
rand_sym = symbols[np.random.randint(0, 4)]
proposed_next_str = current_str + rand_sym
if len(current_str) >= 6:
if current_str[-6:] not in states:
states[current_str[-6:]] = {}
states[current_str[-6:]][rand_sym] = states[current_str[-6:]].get(rand_sym, 0) + 1
current_str = proposed_next_str
except KeyboardInterrupt as e:
# Verify C and G combined takes up ~ half
rat_CG = sum([1 if s in {"C", "G"} else 0 for s in current_str]) / len(current_str)
print(f"ratio (C,G) = {rat_CG}")
# Write "stochastic matrix" to some file
with open("DNA_stochastic.pkl", "wb") as f:
pickle.dump(states, f)
sys.exit(0)