CellPathway/cellpathway_tad.py at main · WGLab/CellPathway · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/env python
"""CellPathway Step 8: TAD annotation and gene mapping.

Usage
-----
    python cellpathway_tad.py \
        --overlap-bed example/dnm_enhc_overlap_cadd_10/Fetal_brain_dnm.bed \
        --tad-file data/tad_w_boundary_08.bed \
        --elements-bb data/genes_w_noncoding.bb \
        --gene-list example/SFARI_Gene.csv \
        --output example/tad_Fetal_brain_autism.csv
"""

import argparse
import pandas as pd
from pybedtools import BedTool
import pyBigWig


def superset_tad(query_region, tad_r, tad_b):
    """Determine whether a query region is within a TAD or crosses a boundary."""
    if isinstance(query_region, (list, tuple)):
        query_region = ' '.join(str(i) for i in query_region)
    query_bed = BedTool(query_region, from_string=True)
    tad_r_bed = BedTool.from_dataframe(tad_r)
    tad_b_bed = BedTool.from_dataframe(tad_b)

    inter_b = tad_b_bed.intersect(query_bed, wa=True, wb=True).to_dataframe()

    if inter_b.shape[0] == 0:
        inter_r = tad_r_bed.intersect(query_bed, wa=True).to_dataframe()
        if inter_r.shape[0] == 0:
            return [None, None, None], "outside_TAD"
        return inter_r.values.tolist()[0], "in_TAD"

    boundary_selected = ' '.join([
        inter_b.chrom.iloc[0],
        str(min(inter_b.start)),
        str(max(inter_b.end))
    ])
    boundary_bed = BedTool(boundary_selected, from_string=True)
    fu = boundary_bed.closest(tad_r_bed, D='ref', fu=True).to_dataframe()
    fd = boundary_bed.closest(tad_r_bed, D='ref', fd=True).to_dataframe()
    region = [fu.values[0][0], fu.values[0][4], fd.values[0][5]]
    return region, "cross_TAD_boundary"


def tad_elements(element_bb, tad_region):
    """Get all genes/elements within a TAD region."""
    if None in tad_region:
        return [], None

    try:
        entries = element_bb.entries(
            tad_region[0], int(tad_region[1]), int(tad_region[2])
        )
    except Exception:
        return [], None

    gene_names = []
    rows = []
    for entry in entries:
        name, indicator = entry[2].split('\t')
        rows.append([tad_region[0], entry[0], entry[1], name])
        if indicator == '1':
            gene_names.append(name)

    elements_df = pd.DataFrame(rows, columns=["chrom", "start", "end", "feature"])
    return gene_names, elements_df


def annotate_tad(overlap_bed_path, tad_path, elements_bb_path,
                 gene_list=None, output_path=None):
    """Map enhancer-DNM overlaps to TADs and identify genes."""
    # Load TAD data
    tad = pd.read_csv(tad_path, sep="\t", header=None,
                       names=["chrom", "start", "end", "label"])
    tad_r = tad.loc[tad["label"].astype(str) == "0",
                     ["chrom", "start", "end"]].reset_index(drop=True)
    tad_b = tad.loc[tad["label"].astype(str) == "1",
                     ["chrom", "start", "end"]].reset_index(drop=True)

    element_bb = pyBigWig.open(elements_bb_path, "r")

    # Load overlap BED and filter to regions with overlaps
    overlap = pd.read_csv(overlap_bed_path, sep="\t", header=None)
    enhancers = overlap[overlap[3] > 0][[0, 1, 2]]
    enhancers.columns = ["chrom", "start", "end"]

    gene_set = set(g.upper() for g in gene_list) if gene_list else None

    results = []
    for _, row in enhancers.iterrows():
        enhc_tri = [row["chrom"], int(row["start"]), int(row["end"])]
        tad_region, status = superset_tad(enhc_tri, tad_r, tad_b)
        gene_names, _ = tad_elements(element_bb, tad_region)

        record = {
            "enh_chr": row["chrom"],
            "enh_start": row["start"],
            "enh_end": row["end"],
            "tad_chr": tad_region[0],
            "tad_start": tad_region[1],
            "tad_end": tad_region[2],
            "status": status,
            "gene_names": ",".join(gene_names) if gene_names else None,
            "n_genes": len(gene_names),
        }

        if gene_set is not None:
            genes_upper = [g.strip().upper() for g in gene_names]
            overlap_genes = [g for g in genes_upper if g in gene_set]
            record["known_gene_overlap"] = ",".join(overlap_genes)

        results.append(record)

    element_bb.close()
    results_df = pd.DataFrame(results)

    if output_path:
        results_df.to_csv(output_path, index=False)
        print(f"TAD annotation saved to: {output_path}")

    return results_df


def main():
    parser = argparse.ArgumentParser(
        description="CellPathway: TAD annotation and gene mapping (Step 8)",
    )
    parser.add_argument("--overlap-bed", required=True,
                        help="Overlap BED file from enrichment step")
    parser.add_argument("--tad-file", required=True,
                        help="TAD BED file (col4 = 0/1 label)")
    parser.add_argument("--elements-bb", required=True,
                        help="Gene annotation .bb file")
    parser.add_argument("--gene-list", default=None,
                        help="CSV with known disease genes (column 'gene-symbol')")
    parser.add_argument("--output", required=True,
                        help="Output CSV path")
    args = parser.parse_args()

    gene_list = None
    if args.gene_list:
        genes_df = pd.read_csv(args.gene_list)
        gene_list = genes_df["gene-symbol"].tolist()

    results = annotate_tad(
        overlap_bed_path=args.overlap_bed,
        tad_path=args.tad_file,
        elements_bb_path=args.elements_bb,
        gene_list=gene_list,
        output_path=args.output,
    )
    print(f"\nTAD annotation rows: {len(results)}")
    print(results.head(10).to_string(index=False))


if __name__ == "__main__":
    main()