From de1e3571c625f189b4f9a5074b3f85ee2cb5be30 Mon Sep 17 00:00:00 2001 From: Johan Mathe Date: Mon, 18 May 2026 15:46:59 -0700 Subject: [PATCH] Update HuggingFace dataset path from bgbench to ogbench Repoint all references from `geometric-intelligence/bgbench` to `geometric-intelligence/ogbench` so dataset downloads, consistency tests, and the croissant metadata target the renamed HF repo. Co-authored-by: Cursor --- README.md | 2 +- croissant_bgbench.json | 14 +++++++------- ogbench/baseline.py | 4 ++-- scripts/plot_adjacency_threshold_analysis.py | 2 +- scripts/utils.py | 2 +- tests/data/test_dataset_config_consistency.py | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 3ec0ad85..97777e3b 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ A CLI entry point is also installed: `ogbench-train` (equivalent to `python ogbe ## Datasets -OGBench includes four curated omics datasets for graph-based classification. All are stored on Hugging Face Hub at [`geometric-intelligence/bgbench`](https://huggingface.co/datasets/geometric-intelligence/bgbench) in Parquet format and downloaded automatically on first use. +OGBench includes four curated omics datasets for graph-based classification. All are stored on Hugging Face Hub at [`geometric-intelligence/ogbench`](https://huggingface.co/datasets/geometric-intelligence/ogbench) in Parquet format and downloaded automatically on first use. | Dataset | Domain | Samples | Features | Classes | Task | | --------------- | ------------------------------- | ------- | --------------- | ------- | ----------------------------- | diff --git a/croissant_bgbench.json b/croissant_bgbench.json index ed0c90de..4ceea6a4 100644 --- a/croissant_bgbench.json +++ b/croissant_bgbench.json @@ -54,7 +54,7 @@ "@id": "repo", "name": "repo", "description": "The Hugging Face git repository.", - "contentUrl": "https://huggingface.co/datasets/geometric-intelligence/bgbench/tree/refs%2Fconvert%2Fparquet", + "contentUrl": "https://huggingface.co/datasets/geometric-intelligence/ogbench/tree/refs%2Fconvert%2Fparquet", "encodingFormat": "git+https", "sha256": "https://github.com/mlcommons/croissant/issues/80" }, @@ -72,7 +72,7 @@ { "@type": "cr:RecordSet", "@id": "default", - "description": "geometric-intelligence/bgbench - 'default' subset", + "description": "geometric-intelligence/ogbench - 'default' subset", "field": [ { "@type": "cr:Field", @@ -223655,10 +223655,10 @@ } ], "conformsTo": "http://mlcommons.org/croissant/1.1", - "name": "bgbench", - "description": "OgBench: Benchmarking Graph Neural Networks on Omics Data\n\nOgBench is the first benchmark suite for graph-level prediction in the \nn ≪ p regime characteristic of omics data, where the number of \npatient samples n is much smaller than the number of nodes (genes or \nproteins) p per graph.\n\nDatasets\n\nThis repository contains four preprocessed omics graph classification \ndatasets:\n\nDataset\nModality\nn\np\nTask\n\nHERITAGE\nProteomics\n654\n4,977\nExercise responder (binary)… See the full description on the dataset page: https://huggingface.co/datasets/geometric-intelligence/bgbench.", + "name": "ogbench", + "description": "OgBench: Benchmarking Graph Neural Networks on Omics Data\n\nOgBench is the first benchmark suite for graph-level prediction in the \nn ≪ p regime characteristic of omics data, where the number of \npatient samples n is much smaller than the number of nodes (genes or \nproteins) p per graph.\n\nDatasets\n\nThis repository contains four preprocessed omics graph classification \ndatasets:\n\nDataset\nModality\nn\np\nTask\n\nHERITAGE\nProteomics\n654\n4,977\nExercise responder (binary)… See the full description on the dataset page: https://huggingface.co/datasets/geometric-intelligence/ogbench.", "alternateName": [ - "geometric-intelligence/bgbench", + "geometric-intelligence/ogbench", "OgBench — Omics Graph Benchmark" ], "creator": { @@ -223685,7 +223685,7 @@ "omics" ], "license": "https://choosealicense.com/licenses/cc-by-4.0/", - "url": "https://huggingface.co/datasets/geometric-intelligence/bgbench", + "url": "https://huggingface.co/datasets/geometric-intelligence/ogbench", "rai:dataLimitations": "All datasets contain hundreds of samples (n=100-700), limiting statistical power and generalizability. Data spans only four omics modalities (gene expression, proteomics, methylation, transcriptomics) from specific cohorts and platforms; findings may not transfer to other modalities or instruments. Class imbalance is present across all datasets: HERITAGE (non-responder ≤15%: 279, responder >15%: 375), Parkinson's (MCI/Normal ≥21: 332, Dementia <21: 203), AddNeuroMed (AD: 284, CTL: 238, MCI: 189), BRCA (LumA: 353, LumB: 132, Her2: 42, Basal: 113); BRCA in particular has a severe imbalance between LumA and Her2. Geographic and demographic scope is restricted to the cohorts of each source study. Not recommended for: clinical diagnosis, treatment decisions, or any deployment in a medical setting without independent clinical validation.", "rai:dataBiases": "All four datasets reflect biases inherent to their source cohorts and label construction. AddNeuroMed (GSE63063): cohort is predominantly elderly and of European ancestry (recruited across UK, Greece, Sweden, and Italy), limiting generalisability to other ethnicities or age groups; labels derive from clinical diagnosis which may vary across sites; ambiguous transitional categories (e.g. 'CTL to AD', 'borderline MCI') were removed, potentially discarding informative edge cases and sharpening apparent class boundaries; residual batch effects may remain after ComBat correction across the two Illumina platforms. MoTrPAC/HERITAGE: cohort comprises sedentary adults from the HERITAGE Family Study (US and Canada), enrolled as family units — samples are therefore not fully independent due to shared genetic and environmental backgrounds; the 15% relative ΔVO₂max responder threshold is a researcher-defined dichotomisation of a continuous trait and encodes assumptions about what constitutes a clinically meaningful response; covariates (age, sex, BMI, race) were regressed out prior to release, which reduces confounding but also removes any signal correlated with these variables. Parkinson's (GSE99039): cohort consists exclusively of Parkinson's disease patients, so the MoCA-derived label (≥21 = MCI/Normal, <21 = Dementia) captures cognitive impairment within PD only and does not generalise to other dementias or healthy populations; the MoCA dichotomisation threshold discards continuous score information and may misclassify borderline cases; cohort is likely predominantly of white/European ancestry, consistent with typical neurological research populations. BRCA (TCGA): TCGA cohort is over-represented by patients from major US academic cancer centers and is predominantly non-Hispanic white, which may limit PAM50 subtype prediction performance in under-represented ethnic groups; PAM50 subtype definitions were themselves developed and validated primarily in white/European women; the Normal subtype was removed during preprocessing, so the benchmark cannot evaluate generalisation to normal breast tissue; DNA methylation values carry age-related epigenetic variation (epigenetic clock effects) that may confound subtype classification; class imbalance is marked (LumA: 353, LumB: 132, Her2: 42, Basal: 113 — Her2 is ~8x less frequent than LumA).", "rai:personalSensitiveInformation": "All four datasets contain health and medical data. The following sensitive attribute categories are present: (1) Health/medical data: disease diagnosis labels (Alzheimer's disease, MCI, Parkinson's disease, breast cancer PAM50 subtype), cognitive assessment scores (MoCA), exercise physiology measurements (VO2max), and high-dimensional molecular profiles (gene expression, proteomics, DNA methylation) — present in all datasets. (2) Age: present as a metadata covariate in MoTrPAC/HERITAGE and likely recorded in all cohorts. (3) Sex/Gender: present as an explicit covariate in MoTrPAC/HERITAGE; BRCA is a female-only cohort by disease definition. (4) Race/Ethnicity: present as an explicit covariate in MoTrPAC/HERITAGE. (5) Body composition (BMI): present as a covariate in MoTrPAC/HERITAGE. All data was de-identified by the original data providers prior to public release. No direct PII (names, dates of birth, contact information) is present in any released file. All four source datasets (GEO GSE63063, GEO GSE99039, MoTrPAC HERITAGE, TCGA-BRCA) are publicly accessible without a Data Use Agreement.", @@ -223740,7 +223740,7 @@ "sc:description": "Labels were derived deterministically from clinical metadata or physiological measurements supplied by the original data providers. No additional human annotation was performed for this benchmark; there are no independent annotators and inter-annotator agreement is therefore not applicable. Labels are stored in {dataset}_targets.parquet as integer class indices.\n\n- AddNeuroMed: Multi-class clinical diagnosis (AD, MCI, CTL) as recorded in the GEO sample metadata field '!Sample_characteristics_ch1: status'. Diagnoses were made by clinicians at the recruiting sites following standard criteria (NINCDS-ADRDA for AD, Petersen criteria for MCI). Quality control was performed by the original AddNeuroMed consortium. Transitional or ambiguous diagnostic categories were excluded during preprocessing (see above). Final integer encoding follows alphabetic ordering of class names: AD=0, CTL=1, MCI=2.\n\n- MoTrPAC: Binary exercise-response label computationally derived from VO2max measurements; no human labelling step. Computed as delta_rel = (VO2max_post - VO2max_baseline) / VO2max_baseline. A participant is labelled a responder (1) if delta_rel > 0.15 (i.e. >15% relative improvement), and a non-responder (0) otherwise. VO2max was measured by a standardised graded maximal exercise test at each clinical site.\n\n- Parkinson's: Binary cognitive impairment label computationally derived from Montreal Cognitive Assessment (MoCA) scores recorded in GEO metadata; no human labelling step. Scores >=21 are classified as MCI/Normal (0); scores <21 are classified as Dementia (1).\n\n- BRCA: PAM50 molecular subtype labels as provided in the AIBIC/MLOmics source file (BRCA_label_num.csv), originally assigned by the TCGA project using an RNA-based PAM50 intrinsic subtype classifier. The Normal subtype was excluded during preprocessing. Remaining labels were remapped to: LumA=0, Her2=1, LumB=2, Basal=3. Source numeric label mapping: original {0:LumA, 1:Her2, 2:LumB, 4:Basal} → model {0, 1, 2, 3}." } ], - "citeAs": "@misc{bgbench2026,\n title={OgBench: Benchmarking Graph Neural Networks on Omics Data},\n author={Geometric Intelligence},\n year={2026},\n howpublished={\\url{https://huggingface.co/datasets/geometric-intelligence/bgbench}}\n}", + "citeAs": "@misc{ogbench2026,\n title={OgBench: Benchmarking Graph Neural Networks on Omics Data},\n author={Geometric Intelligence},\n year={2026},\n howpublished={\\url{https://huggingface.co/datasets/geometric-intelligence/ogbench}}\n}", "datePublished": "2025-05-01", "version": "1.0.0", "rai:dataCollection": "Data was collected from four publicly available biomedical repositories: GEO (GSE63063, GSE99039), MoTrPAC, and TCGA via AIBIC/MLOmics on HuggingFace. No new human data was collected. All source datasets are de-identified and publicly accessible." diff --git a/ogbench/baseline.py b/ogbench/baseline.py index d82410d5..851fd346 100644 --- a/ogbench/baseline.py +++ b/ogbench/baseline.py @@ -176,7 +176,7 @@ def load_metadata(data_name: str, cfg: DictConfig) -> dict[str, Any] | None: # Download from HuggingFace try: logger.info('Downloading metadata from HuggingFace...') - hf_repo_id = 'geometric-intelligence/bgbench' + hf_repo_id = 'geometric-intelligence/ogbench' revision = cfg.dataset.loader.parameters.get('revision', '3abc196') metadata_file = hf_hub_download( # nosec @@ -288,7 +288,7 @@ def load_and_prepare_data(cfg: DictConfig) -> DatasetContainer: # Download from HuggingFace logger.info('Downloading from HuggingFace...') - hf_repo_id = 'geometric-intelligence/bgbench' + hf_repo_id = 'geometric-intelligence/ogbench' revision = cfg.dataset.loader.parameters.get('revision', '3abc196') data_file = hf_hub_download( # nosec diff --git a/scripts/plot_adjacency_threshold_analysis.py b/scripts/plot_adjacency_threshold_analysis.py index a7e7e6de..3b8d16c6 100755 --- a/scripts/plot_adjacency_threshold_analysis.py +++ b/scripts/plot_adjacency_threshold_analysis.py @@ -56,7 +56,7 @@ def load_dataset_config(dataset_name: str) -> dict[str, Any]: METHODS = ['variance', 'random', 'correlation', 'distance_correlation'] NODE_SAMPLE_RATIOS = [1.0, 0.8, 0.5, 0.3] -HF_REPO_ID = 'geometric-intelligence/bgbench' +HF_REPO_ID = 'geometric-intelligence/ogbench' def load_and_preprocess_dataset( diff --git a/scripts/utils.py b/scripts/utils.py index 7b926cf3..66526732 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -79,7 +79,7 @@ def upload_to_huggingface( """ try: api = huggingface_hub.HfApi() - repo_id = 'geometric-intelligence/bgbench' + repo_id = 'geometric-intelligence/ogbench' # Create repository if it doesn't exist try: diff --git a/tests/data/test_dataset_config_consistency.py b/tests/data/test_dataset_config_consistency.py index 2c97984f..5f8eeab6 100644 --- a/tests/data/test_dataset_config_consistency.py +++ b/tests/data/test_dataset_config_consistency.py @@ -15,7 +15,7 @@ HF_CONFIG_PATH = Path(__file__).resolve().parents[2] / 'configs' / 'hf' / 'default.yaml' DATASET_CONFIGS = sorted(DATASET_CONFIG_DIR.glob('*.yaml')) -HF_REPO_ID = 'geometric-intelligence/bgbench' +HF_REPO_ID = 'geometric-intelligence/ogbench' def _load_config(path: Path) -> dict: