From 3fceec299b44bbf0926546e9d9c6b846e8d06dcf Mon Sep 17 00:00:00 2001 From: sebastian9991 Date: Tue, 21 Oct 2025 18:42:47 -0400 Subject: [PATCH 01/10] update: plotting one-hop distribution on phishing indices --- .../weak_supervision_experiment.py | 22 ++++++++++++++++--- tgrag/utils/plot.py | 18 +++++++++++++++ 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/tgrag/experiments/gnn_experiments/weak_supervision_experiment.py b/tgrag/experiments/gnn_experiments/weak_supervision_experiment.py index cd7a3b68..eb024a81 100644 --- a/tgrag/experiments/gnn_experiments/weak_supervision_experiment.py +++ b/tgrag/experiments/gnn_experiments/weak_supervision_experiment.py @@ -16,6 +16,7 @@ from tgrag.utils.logger import setup_logging from tgrag.utils.matching import reverse_domain from tgrag.utils.path import get_root_dir, get_scratch +from tgrag.utils.plot import plot_neighbor_distribution from tgrag.utils.seed import seed_everything parser = argparse.ArgumentParser( @@ -34,6 +35,7 @@ def run_weak_supervision_forward( model_arguments: ModelArguments, dataset: TemporalDataset, weight_directory: Path, + target: str, ) -> None: root = get_root_dir() phishing_dict: Dict[str, str] = { @@ -72,8 +74,8 @@ def run_weak_supervision_forward( phishing_loader = NeighborLoader( data, input_nodes=phishing_indices, - num_neighbors=[30, 30, 30], - batch_size=1024, + num_neighbors=model_arguments.num_neighbors, + batch_size=model_arguments.batch_size, shuffle=False, ) logging.info( @@ -82,14 +84,27 @@ def run_weak_supervision_forward( num_nodes = data.num_nodes all_preds = torch.zeros(num_nodes, 1) + neighbor_preds = [] with torch.no_grad(): for batch in tqdm(phishing_loader, desc=f'{dataset_name} batch'): batch = batch.to(device) preds = model(batch.x, batch.edge_index) seed_nodes = batch.n_id[: batch.batch_size] + + pred_neighbors = preds[batch.batch_size :] + neighbor_preds.append(pred_neighbors.cpu()) + all_preds[seed_nodes] = preds[: batch.batch_size].cpu() + neighbor_preds
= torch.cat(neighbor_preds, dim=0) + + plot_neighbor_distribution( + neighbor_preds=neighbor_preds, + dataset_name=dataset_name, + model_name=model_arguments.model, + target=target, + ) preds = all_preds[phishing_indices] logging.info(f'Number of predictions: {preds.size()}') logging.info(f'Predictions: {preds}') @@ -145,7 +160,7 @@ def main() -> None: encoding=encoding_dict, seed=meta_args.global_seed, processed_dir=f'{scratch}/{meta_args.processed_location}', - ) # Map to .to_cpu() + ) logging.info('In-Memory Dataset loaded.') weight_directory = ( root / cast(str, meta_args.weights_directory) / f'{meta_args.target_col}' @@ -157,6 +172,7 @@ def main() -> None: experiment_arg.model_args, dataset, weight_directory, + target=meta_args.target_col, ) diff --git a/tgrag/utils/plot.py b/tgrag/utils/plot.py index bc14d762..a9edc52a 100644 --- a/tgrag/utils/plot.py +++ b/tgrag/utils/plot.py @@ -1015,3 +1015,21 @@ def plot_pred_target_distributions_histogram( plt.tight_layout() plt.savefig(save_path, bbox_inches='tight', pad_inches=0.1) plt.close() + + +def plot_neighbor_distribution( + neighbor_preds: Tensor, dataset_name: str, model_name: str, target: str +) -> None: + root = get_root_dir() + save_dir = root / 'results' / 'plots' / model_name / 'distribution' / target + save_dir.mkdir(parents=True, exist_ok=True) + save_path = save_dir / f'{dataset_name}_neighbor_pred_distribution.png' + plt.figure(figsize=(6, 4)) + plt.hist(neighbor_preds.numpy(), bins=20, range=(0, 1), edgecolor='black') + plt.title(f'Predicted Label Distribution (Neighbors) — {dataset_name}') + plt.xlabel('Predicted label (0, 1)') + plt.ylabel('Frequency') + plt.grid(alpha=0.3) + plt.tight_layout() + plt.savefig(save_path) + plt.close() From 9cc9f442e936530e8da65a9c21f9d8358ba92787 Mon Sep 17 00:00:00 2001 From: sebastian9991 Date: Tue, 21 Oct 2025 19:17:47 -0400 Subject: [PATCH 02/10] update: fixing logging statements --- .../experiments/gnn_experiments/weak_supervision_experiment.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/tgrag/experiments/gnn_experiments/weak_supervision_experiment.py b/tgrag/experiments/gnn_experiments/weak_supervision_experiment.py index eb024a81..99ffce5e 100644 --- a/tgrag/experiments/gnn_experiments/weak_supervision_experiment.py +++ b/tgrag/experiments/gnn_experiments/weak_supervision_experiment.py @@ -105,9 +105,9 @@ def run_weak_supervision_forward( model_name=model_arguments.model, target=target, ) + logging.info(f'Saving distribution of {dataset_name}') preds = all_preds[phishing_indices] logging.info(f'Number of predictions: {preds.size()}') - logging.info(f'Predictions: {preds}') for threshold in [0.1, 0.3, 0.5]: upper = dataset_name == 'IP2Location' accuracy = get_accuracy(preds, threshold=threshold, upper=upper) From 9cc593c339b3a1ac87dafa69e8079cfea8c63e88 Mon Sep 17 00:00:00 2001 From: sebastian9991 Date: Wed, 22 Oct 2025 11:15:31 -0400 Subject: [PATCH 03/10] update: plotting neighborhood degree distribution --- .../weak_supervision_experiment.py | 33 ++++++++++++++++++- tgrag/utils/plot.py | 22 +++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/tgrag/experiments/gnn_experiments/weak_supervision_experiment.py b/tgrag/experiments/gnn_experiments/weak_supervision_experiment.py index 99ffce5e..94f2ffc6 100644 --- a/tgrag/experiments/gnn_experiments/weak_supervision_experiment.py +++ b/tgrag/experiments/gnn_experiments/weak_supervision_experiment.py @@ -6,6 +6,7 @@ import pandas as pd import torch from torch_geometric.loader import NeighborLoader +from torch_geometric.utils import degree from tqdm import tqdm from tgrag.dataset.temporal_dataset import TemporalDataset @@ -16,7 +17,10 @@ from tgrag.utils.logger import setup_logging from tgrag.utils.matching import reverse_domain from tgrag.utils.path import get_root_dir, get_scratch -from tgrag.utils.plot import plot_neighbor_distribution +from tgrag.utils.plot import ( + plot_neighbor_degree_distribution, + 
plot_neighbor_distribution, +) from tgrag.utils.seed import seed_everything parser = argparse.ArgumentParser( @@ -44,6 +48,13 @@ def run_weak_supervision_forward( 'PhishTank': 'data/phishing_data/cc_dec_2024_phishtank_domains.csv', } data = dataset[0] + + src, dst = data.edge_index + logging.info(f'Src, dst degrees loaded.') + + out_degree = degree(src, num_nodes=data.num_nodes) + in_degree = degree(dst, num_nodes=data.num_nodes) + device = f'cuda:{model_arguments.device}' if torch.cuda.is_available() else 'cpu' device = torch.device(device) logging.info(f'Device found: {device}') @@ -85,6 +96,7 @@ def run_weak_supervision_forward( num_nodes = data.num_nodes all_preds = torch.zeros(num_nodes, 1) neighbor_preds = [] + neighbor_nodes = set() with torch.no_grad(): for batch in tqdm(phishing_loader, desc=f'{dataset_name} batch'): @@ -94,10 +106,15 @@ def run_weak_supervision_forward( pred_neighbors = preds[batch.batch_size :] neighbor_preds.append(pred_neighbors.cpu()) + neighbor_nodes.update(batch.n_id[batch.batch_size :].tolist()) all_preds[seed_nodes] = preds[: batch.batch_size].cpu() neighbor_preds = torch.cat(neighbor_preds, dim=0) + neighbor_nodes = torch.tensor(list(neighbor_nodes), dtype=torch.long) + + neighbor_in_degree = in_degree[neighbor_nodes] + neighbor_out_degree = out_degree[neighbor_nodes] plot_neighbor_distribution( neighbor_preds=neighbor_preds, @@ -105,6 +122,20 @@ def run_weak_supervision_forward( model_name=model_arguments.model, target=target, ) + plot_neighbor_degree_distribution( + neighbor_degree=neighbor_in_degree, + dataset_name=dataset_name, + model_name=model_arguments.model, + target=target, + degree='In-degree', + ) + plot_neighbor_degree_distribution( + neighbor_degree=neighbor_out_degree, + dataset_name=dataset_name, + model_name=model_arguments.model, + target=target, + degree='Out-degree', + ) logging.info(f'Saving distribution of {dataset_name}') preds = all_preds[phishing_indices] logging.info(f'Number of predictions: 
{preds.size()}') diff --git a/tgrag/utils/plot.py b/tgrag/utils/plot.py index a9edc52a..2edfce81 100644 --- a/tgrag/utils/plot.py +++ b/tgrag/utils/plot.py @@ -1033,3 +1033,25 @@ def plot_neighbor_distribution( plt.tight_layout() plt.savefig(save_path) plt.close() + + +def plot_neighbor_degree_distribution( + neighbor_degree: Tensor, + dataset_name: str, + model_name: str, + target: str, + degree: str, +) -> None: + root = get_root_dir() + save_dir = root / 'results' / 'plots' / model_name / 'distribution' / target + save_dir.mkdir(parents=True, exist_ok=True) + save_path = save_dir / f'{dataset_name}_neighbor_{degree}_degree_distribution.png' + plt.figure(figsize=(6, 4)) + plt.hist(neighbor_degree.numpy(), bins=20, range=(0, 1), edgecolor='black') + plt.title(f'{degree} Distribution (Neighbors) — {dataset_name}') + plt.xlabel(f'{degree}') + plt.ylabel('Frequency') + plt.grid(alpha=0.3) + plt.tight_layout() + plt.savefig(save_path) + plt.close() From ab31f9923b72aa35df67129e9f5a7e88e94964b9 Mon Sep 17 00:00:00 2001 From: sebastian9991 Date: Wed, 22 Oct 2025 11:29:14 -0400 Subject: [PATCH 04/10] fix: plotting now plotting discrete --- .../gnn_experiments/weak_supervision_experiment.py | 4 ++-- tgrag/utils/plot.py | 7 ++++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/tgrag/experiments/gnn_experiments/weak_supervision_experiment.py b/tgrag/experiments/gnn_experiments/weak_supervision_experiment.py index 94f2ffc6..0645b113 100644 --- a/tgrag/experiments/gnn_experiments/weak_supervision_experiment.py +++ b/tgrag/experiments/gnn_experiments/weak_supervision_experiment.py @@ -52,8 +52,8 @@ def run_weak_supervision_forward( src, dst = data.edge_index logging.info(f'Src, dst degrees loaded.') - out_degree = degree(src, num_nodes=data.num_nodes) - in_degree = degree(dst, num_nodes=data.num_nodes) + out_degree = degree(src, num_nodes=data.num_nodes, dtype=torch.long) + in_degree = degree(dst, num_nodes=data.num_nodes, dtype=torch.long) device = 
f'cuda:{model_arguments.device}' if torch.cuda.is_available() else 'cpu' device = torch.device(device) diff --git a/tgrag/utils/plot.py b/tgrag/utils/plot.py index 2edfce81..810670e1 100644 --- a/tgrag/utils/plot.py +++ b/tgrag/utils/plot.py @@ -1047,7 +1047,12 @@ def plot_neighbor_degree_distribution( save_dir.mkdir(parents=True, exist_ok=True) save_path = save_dir / f'{dataset_name}_neighbor_{degree}_degree_distribution.png' plt.figure(figsize=(6, 4)) - plt.hist(neighbor_degree.numpy(), bins=20, range=(0, 1), edgecolor='black') + plt.hist( + neighbor_degree.numpy(), + bins=range(int(neighbor_degree.max() + 2)), + edgecolor='black', + align='left', + ) plt.title(f'{degree} Distribution (Neighbors) — {dataset_name}') plt.xlabel(f'{degree}') plt.ylabel('Frequency') From 2a908517920c78f5607450a46ca3ca362ef784be Mon Sep 17 00:00:00 2001 From: sebastian9991 Date: Wed, 22 Oct 2025 11:42:47 -0400 Subject: [PATCH 05/10] update: using log space --- tgrag/utils/plot.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tgrag/utils/plot.py b/tgrag/utils/plot.py index 810670e1..35348be4 100644 --- a/tgrag/utils/plot.py +++ b/tgrag/utils/plot.py @@ -1047,9 +1047,13 @@ def plot_neighbor_degree_distribution( save_dir.mkdir(parents=True, exist_ok=True) save_path = save_dir / f'{dataset_name}_neighbor_{degree}_degree_distribution.png' plt.figure(figsize=(6, 4)) + + deg = neighbor_degree.numpy() + deg = deg[deg > 0] + bins = np.logspace(np.log10(deg.min()), np.log10(deg.max()), 50) plt.hist( - neighbor_degree.numpy(), - bins=range(int(neighbor_degree.max() + 2)), + deg, + bins=bins, edgecolor='black', align='left', ) From ce007b32c2e4ed815a1aa83ac52ea6e1c64a6d93 Mon Sep 17 00:00:00 2001 From: sebastian9991 Date: Wed, 22 Oct 2025 11:52:25 -0400 Subject: [PATCH 06/10] update: logging added --- .../experiments/gnn_experiments/weak_supervision_experiment.py | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/tgrag/experiments/gnn_experiments/weak_supervision_experiment.py b/tgrag/experiments/gnn_experiments/weak_supervision_experiment.py index 0645b113..6788505b 100644 --- a/tgrag/experiments/gnn_experiments/weak_supervision_experiment.py +++ b/tgrag/experiments/gnn_experiments/weak_supervision_experiment.py @@ -114,7 +114,9 @@ def run_weak_supervision_forward( neighbor_nodes = torch.tensor(list(neighbor_nodes), dtype=torch.long) neighbor_in_degree = in_degree[neighbor_nodes] + logging.info(f'Size of in-degree tensor: {neighbor_in_degree.size()}') neighbor_out_degree = out_degree[neighbor_nodes] + logging.info(f'Size of out-degree tensor: {neighbor_out_degree.size()}') plot_neighbor_distribution( neighbor_preds=neighbor_preds, From aa609eee0b13f864cf01831b38f2ee1a65020206 Mon Sep 17 00:00:00 2001 From: sebastian9991 Date: Wed, 22 Oct 2025 12:05:02 -0400 Subject: [PATCH 07/10] update: plotting unique degrees --- tgrag/utils/plot.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tgrag/utils/plot.py b/tgrag/utils/plot.py index 35348be4..01151362 100644 --- a/tgrag/utils/plot.py +++ b/tgrag/utils/plot.py @@ -1050,12 +1050,19 @@ def plot_neighbor_degree_distribution( deg = neighbor_degree.numpy() deg = deg[deg > 0] - bins = np.logspace(np.log10(deg.min()), np.log10(deg.max()), 50) + + unique_deg, counts = torch.unique(deg, return_counts=True) + + sorted_idx = torch.argsort(unique_deg) + unique_deg = unique_deg[sorted_idx] + counts = counts[sorted_idx] + plt.hist( - deg, - bins=bins, + unique_deg.numpy(), + counts.numpy(), + width=0.8, edgecolor='black', - align='left', + align='center', ) plt.title(f'{degree} Distribution (Neighbors) — {dataset_name}') plt.xlabel(f'{degree}') From 2b141340575ceadb07285386ac08051127cee3bd Mon Sep 17 00:00:00 2001 From: sebastian9991 Date: Wed, 22 Oct 2025 12:16:10 -0400 Subject: [PATCH 08/10] fix: unique deg was set to numpy --- tgrag/utils/plot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/tgrag/utils/plot.py b/tgrag/utils/plot.py index 01151362..fee45a23 100644 --- a/tgrag/utils/plot.py +++ b/tgrag/utils/plot.py @@ -1048,7 +1048,7 @@ def plot_neighbor_degree_distribution( save_path = save_dir / f'{dataset_name}_neighbor_{degree}_degree_distribution.png' plt.figure(figsize=(6, 4)) - deg = neighbor_degree.numpy() + deg = neighbor_degree deg = deg[deg > 0] unique_deg, counts = torch.unique(deg, return_counts=True) From 6bcb952de37eea68356c87f8d7517daa271d42d4 Mon Sep 17 00:00:00 2001 From: sebastian9991 Date: Wed, 22 Oct 2025 12:22:10 -0400 Subject: [PATCH 09/10] fix: bar not hist --- tgrag/utils/plot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tgrag/utils/plot.py b/tgrag/utils/plot.py index fee45a23..10c51884 100644 --- a/tgrag/utils/plot.py +++ b/tgrag/utils/plot.py @@ -1057,7 +1057,7 @@ def plot_neighbor_degree_distribution( unique_deg = unique_deg[sorted_idx] counts = counts[sorted_idx] - plt.hist( + plt.bar( unique_deg.numpy(), counts.numpy(), width=0.8, From 615c036c0aae780c7239232e3355f390eebff19a Mon Sep 17 00:00:00 2001 From: sebastian9991 Date: Wed, 22 Oct 2025 12:33:02 -0400 Subject: [PATCH 10/10] logging sample output --- .../experiments/gnn_experiments/weak_supervision_experiment.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tgrag/experiments/gnn_experiments/weak_supervision_experiment.py b/tgrag/experiments/gnn_experiments/weak_supervision_experiment.py index 6788505b..deba68a1 100644 --- a/tgrag/experiments/gnn_experiments/weak_supervision_experiment.py +++ b/tgrag/experiments/gnn_experiments/weak_supervision_experiment.py @@ -115,8 +115,10 @@ def run_weak_supervision_forward( neighbor_in_degree = in_degree[neighbor_nodes] logging.info(f'Size of in-degree tensor: {neighbor_in_degree.size()}') + logging.info(f'Sample of in-degree: {neighbor_in_degree[:10]}') neighbor_out_degree = out_degree[neighbor_nodes] logging.info(f'Size of out-degree tensor: {neighbor_out_degree.size()}') + 
logging.info(f'Sample of out-degree: {neighbor_out_degree[:10]}') plot_neighbor_distribution( neighbor_preds=neighbor_preds,