-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpython_functions
More file actions
31 lines (26 loc) · 984 Bytes
/
python_functions
File metadata and controls
31 lines (26 loc) · 984 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# Load a GTF file into Python and parse the attributes field
# ----------------------------------------------------------
import gffutils
import pandas as pd
# Build the feature database in memory (':memory:') rather than on disk.
# `gtf_file_name` is not defined in this snippet and must be set by the caller.
# Both inference options are disabled; presumably the GTF already contains
# explicit gene and transcript records -- confirm for your annotation source.
db = gffutils.create_db(
    gtf_file_name,
    ':memory:',
    disable_infer_transcripts=True,
    disable_infer_genes=True,
)
def process_attributes(feature):
    """Flatten a feature's attributes into a ``{key: str}`` dictionary.

    Attribute values are expected to be lists (as gffutils stores them); the
    items of each list are joined with commas so every attribute fits in a
    single table cell.

    Args:
        feature: Object exposing a mapping-like ``attributes`` member, e.g. a
            ``gffutils.Feature``.

    Returns:
        dict: One entry per attribute key, each value a comma-separated string.
    """
    flattened = {}
    for key, value in feature.attributes.items():
        if isinstance(value, str):
            # A bare string is itself iterable: joining it would put a comma
            # between every character, so pass it through unchanged.
            flattened[key] = value
        else:
            # str() keeps the join from raising on non-string items.
            flattened[key] = ','.join(map(str, value))
    return flattened
# Iterate over all features, building one flat record (dict) per feature.
features_data = []
for feature in db.all_features():
    # Extract standard GFF columns.
    # NOTE(review): the frame/phase column is not captured here -- confirm
    # that downstream code does not need it.
    data = {
        'seq_id': feature.seqid,
        'source': feature.source,
        'type': feature.featuretype,
        'start': feature.start,
        'end': feature.end,
        'score': feature.score,
        'strand': feature.strand,
    }
    # Parse attributes and merge them into the record. An attribute whose key
    # equals one of the column names above overwrites that column's value.
    data.update(process_attributes(feature))
    features_data.append(data)

# Convert the list of dictionaries to a pandas DataFrame. Records may have
# different attribute keys; pandas fills the missing cells with NaN.
gtf = pd.DataFrame(features_data)