-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_utils.py
More file actions
64 lines (51 loc) · 1.89 KB
/
Copy pathdata_utils.py
File metadata and controls
64 lines (51 loc) · 1.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import os
import numpy as np
import pandas as pd
import torch
from typing import Any, Callable, Dict, List, Optional, Tuple
from binn.data import (
ReactomeNetwork,
prepareAnnData,
anndata_to_graph_data,
get_map
)
from torch_geometric.transforms import BaseTransform
from torch_geometric.transforms import RandomNodeSplit
class _SafeFrame(pd.DataFrame):
    """DataFrame subclass whose ``[]`` lookup also accepts a ``set`` key.

    A ``set`` key is turned into a ``list`` before it is handed to
    ``pandas.DataFrame.__getitem__``; any other key is forwarded unchanged.
    """

    @property
    def _constructor(self):
        # pandas consults ``_constructor`` when it builds a new frame from
        # this one, so returning the subclass keeps derived frames "safe".
        return _SafeFrame

    def __getitem__(self, key):
        """Return ``self[key]``, converting a ``set`` key to a ``list`` first."""
        # NOTE: the list follows the set's own iteration order, so the
        # column order of the result is whatever that order happens to be.
        lookup = list(key) if isinstance(key, set) else key
        return super().__getitem__(lookup)
# Prostate paper: another matter
def load_and_merge_tables(data_dir: str) -> pd.DataFrame:
    """Load the CNA, response and clinical tables and inner-join them on ``id``.

    Files read from ``data_dir``:
      * ``P1000_data_CNA_paper.csv`` -- indexed by its ``"Unnamed: 0"``
        column; that index is also copied into an ``id`` column.
      * ``response_paper.csv`` -- joined on ``id``, so it must already
        provide an ``id`` column.
      * ``prad_p1000_clinical_final.txt`` -- tab-delimited; its ``comp_id``
        column is copied into ``id``.

    Args:
        data_dir: Directory containing the three input files.

    Returns:
        The inner join of the three tables on ``id``, wrapped in ``_SafeFrame``.
    """

    def _in_data_dir(filename):
        # All inputs live directly under ``data_dir``.
        return os.path.join(data_dir, filename)

    # Copy-number table: the row labels double as the join key.
    copy_number = pd.read_csv(_in_data_dir("P1000_data_CNA_paper.csv"))
    copy_number = copy_number.set_index("Unnamed: 0")
    copy_number["id"] = copy_number.index

    outcomes = pd.read_csv(_in_data_dir("response_paper.csv"))

    # Clinical table: expose ``comp_id`` under the shared key name.
    patients = pd.read_csv(_in_data_dir("prad_p1000_clinical_final.txt"), delimiter="\t")
    patients["id"] = patients["comp_id"]

    # Inner joins keep only ids present in all three tables.
    joined = copy_number.merge(outcomes, how="inner", on="id")
    joined = joined.merge(patients, how="inner", on="id")
    return _SafeFrame(joined)
# FIXME: can we modify to be independent of graph_data?
def prepare_graph_data(
    merged_df: pd.DataFrame,
    obs_vars: List[str],
    reactome_net: ReactomeNetwork,
    data_dir: str,
    *,
    n_levels: int = 3,
    group: str = "response",
    knn: int = 3,
    transform: Optional[BaseTransform] = None,
):
    """Build the AnnData object and the graph data derived from it.

    The merged table is passed to ``prepareAnnData`` together with the map
    returned by ``get_map(reactome_net, n_levels=n_levels)``; the resulting
    AnnData is then converted with ``anndata_to_graph_data``.

    Args:
        merged_df: Table passed to ``prepareAnnData`` as ``data``.
        obs_vars: Passed to ``prepareAnnData`` as ``obs_vars``.
        reactome_net: Network passed to ``get_map``.
        data_dir: Currently unused; kept so existing callers keep working.
        n_levels: Forwarded to ``get_map``.
        group: Forwarded to ``anndata_to_graph_data`` as ``group``.
        knn: Forwarded to ``anndata_to_graph_data`` as ``knn``.
        transform: Transform forwarded to ``anndata_to_graph_data``. When
            ``None`` (the default), the original fixed split is used:
            ``RandomNodeSplit(split="random", num_train_per_class=200,
            num_val=200, num_test=314)``.

    Returns:
        A ``(data, adata, map_df)`` tuple: the graph data, the AnnData
        object, and the map frame returned by ``prepareAnnData``.
    """
    pathway_map = get_map(reactome_net, n_levels=n_levels)
    adata, map_df = prepareAnnData(data=merged_df, obs_vars=obs_vars, map=pathway_map)

    if transform is None:
        # Default split sizes are the ones this function always used;
        # pass ``transform`` explicitly for datasets of a different size.
        transform = RandomNodeSplit(
            split="random",
            num_train_per_class=200,
            num_val=200,
            num_test=314,
        )

    data = anndata_to_graph_data(
        adata=adata,
        group=group,
        knn=knn,
        transform=transform,
    )
    return data, adata, map_df