diff --git a/.Rbuildignore b/.Rbuildignore index 63b5dd5..dd2e70d 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -7,3 +7,9 @@ ^data-raw$ ^doc$ ^Meta$ +^\.positai$ +^\.claude$ +^_pkgdown\.yml$ +^docs$ +^pkgdown$ +^\.github$ diff --git a/.github/.gitignore b/.github/.gitignore new file mode 100644 index 0000000..2d19fc7 --- /dev/null +++ b/.github/.gitignore @@ -0,0 +1 @@ +*.html diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml new file mode 100644 index 0000000..16c6c8b --- /dev/null +++ b/.github/workflows/pkgdown.yaml @@ -0,0 +1,47 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + branches: [main, master] + pull_request: + release: + types: [published] + workflow_dispatch: + +name: pkgdown.yaml + +permissions: read-all + +jobs: + pkgdown: + runs-on: ubuntu-latest + # Only restrict concurrency for non-PR jobs + concurrency: + group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + permissions: + contents: write + steps: + - uses: actions/checkout@v6 + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::pkgdown, local::. 
+ needs: website + + - name: Build site + run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) + shell: Rscript {0} + + - name: Deploy to GitHub pages 🚀 + if: github.event_name != 'pull_request' + uses: JamesIves/github-pages-deploy-action@d92aa235d04922e8f08b40ce78cc5442fcfbfa2f # v4.8.0 + with: + clean: false + branch: gh-pages + folder: docs diff --git a/.gitignore b/.gitignore index 9eb58ce..c57affe 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,5 @@ inst/doc /doc/ /Meta/ +.positai +docs diff --git a/DESCRIPTION b/DESCRIPTION index deb9202..690dee0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -47,6 +47,6 @@ Config/testthat/edition: 3 Depends: R (>= 4.0) LazyData: true -URL: https://github.com/ajxa/CellVoteR +URL: https://github.com/ajxa/CellVoteR, https://ajxa.github.io/CellVoteR/ BugReports: https://github.com/ajxa/CellVoteR/issues VignetteBuilder: knitr diff --git a/_pkgdown.yml b/_pkgdown.yml new file mode 100644 index 0000000..256b8d5 --- /dev/null +++ b/_pkgdown.yml @@ -0,0 +1,4 @@ +url: https://ajxa.github.io/CellVoteR/ +template: + bootstrap: 5 + diff --git a/pkgdown/favicon/apple-touch-icon.png b/pkgdown/favicon/apple-touch-icon.png new file mode 100644 index 0000000..3f190e5 Binary files /dev/null and b/pkgdown/favicon/apple-touch-icon.png differ diff --git a/pkgdown/favicon/favicon-96x96.png b/pkgdown/favicon/favicon-96x96.png new file mode 100644 index 0000000..8c60f8e Binary files /dev/null and b/pkgdown/favicon/favicon-96x96.png differ diff --git a/pkgdown/favicon/favicon.ico b/pkgdown/favicon/favicon.ico new file mode 100644 index 0000000..0fed017 Binary files /dev/null and b/pkgdown/favicon/favicon.ico differ diff --git a/pkgdown/favicon/favicon.svg b/pkgdown/favicon/favicon.svg new file mode 100644 index 0000000..ae86acf --- /dev/null +++ b/pkgdown/favicon/favicon.svg @@ -0,0 +1 @@ +RealFaviconGeneratorhttps://realfavicongenerator.net \ No newline at end of file diff --git a/pkgdown/favicon/site.webmanifest 
b/pkgdown/favicon/site.webmanifest new file mode 100644 index 0000000..4ebda26 --- /dev/null +++ b/pkgdown/favicon/site.webmanifest @@ -0,0 +1,21 @@ +{ + "name": "", + "short_name": "", + "icons": [ + { + "src": "/web-app-manifest-192x192.png", + "sizes": "192x192", + "type": "image/png", + "purpose": "maskable" + }, + { + "src": "/web-app-manifest-512x512.png", + "sizes": "512x512", + "type": "image/png", + "purpose": "maskable" + } + ], + "theme_color": "#ffffff", + "background_color": "#ffffff", + "display": "standalone" +} \ No newline at end of file diff --git a/pkgdown/favicon/web-app-manifest-192x192.png b/pkgdown/favicon/web-app-manifest-192x192.png new file mode 100644 index 0000000..7a47874 Binary files /dev/null and b/pkgdown/favicon/web-app-manifest-192x192.png differ diff --git a/pkgdown/favicon/web-app-manifest-512x512.png b/pkgdown/favicon/web-app-manifest-512x512.png new file mode 100644 index 0000000..47dbae9 Binary files /dev/null and b/pkgdown/favicon/web-app-manifest-512x512.png differ diff --git a/renv.lock b/renv.lock index 9d12566..217b78e 100644 --- a/renv.lock +++ b/renv.lock @@ -498,14 +498,14 @@ }, "cli": { "Package": "cli", - "Version": "3.6.5", + "Version": "3.6.6", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", "utils" ], - "Hash": "16850760556401a2eeb27d39bd11c9cb" + "Hash": "a73d822b669d443ff8de6928f9c49850" }, "cluster": { "Package": "cluster", @@ -899,14 +899,14 @@ }, "rlang": { "Package": "rlang", - "Version": "1.1.7", + "Version": "1.2.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", "utils" ], - "Hash": "34c0d101f4613098abc538b82e0d86c5" + "Hash": "f88151fb9ca15e72dc351deb1328716e" }, "rsvd": { "Package": "rsvd", diff --git a/vignettes/CellVoteR.Rmd b/vignettes/CellVoteR.Rmd index 4b369e3..7481c0c 100644 --- a/vignettes/CellVoteR.Rmd +++ b/vignettes/CellVoteR.Rmd @@ -3,13 +3,15 @@ title: "CellVoteR: Ensemble Cell Type Annotation for Single-Cell RNA-seq" author: "Shoaib Ajaib" date: 
"`r Sys.Date()`" output: - rmarkdown::html_vignette: - toc: true - toc_depth: 3 - fig_width: 7 - fig_height: 5 + rmdformats::downcute: + self_contained: true + default_style: "light" + downcute_theme: "default" + code_folding: show + toc_depth: 4 + toc_float: true vignette: > - %\VignetteIndexEntry{CellVoteR: Ensemble Cell Type Annotation for Single-Cell RNA-seq} + %\VignetteIndexEntry{CellVoteR: Ensemble cell type annotation for single-cell RNA-seq} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -21,8 +23,7 @@ knitr::opts_chunk$set( eval = FALSE ) ``` - -## Overview +# Overview CellVoteR is an ensemble-based pipeline for robust cell type annotation in single-cell RNA-seq (scRNA-seq) data. Rather than relying on a single @@ -35,27 +36,28 @@ The core design philosophy is: - **Divide and conquer** — broadly triage cells into lineages before applying fine-resolution annotation, preventing dominant populations from masking rare cell types + - **Redundancy** — four methods running in parallel reduces sensitivity to the failure modes of any single approach + - **Separation of concerns** — annotation (slow, compute-intensive) and consensus resolution (fast, parameter-sensitive) are decoupled, so the user can re-tune voting without repeating the pipeline -### What CellVoteR requires -Two inputs are needed before running the pipeline: +In order to run the complete workflow, CellVoteR requires two inputs to be supplied: -1. A **raw gene-by-cell counts matrix** (sparse `dgCMatrix`, RDS file, or MTX - triplet) -2. A **marker configuration** — a structured list of broad and fine cell type - marker genes, loaded from an Excel workbook or nested YAML file +1. A **raw gene-by-cell counts matrix** (sparse `dgCMatrix`, `RDS`, or `MTX triplet` file). + +2. A **marker configuration** — a structured list of broad and fine cell type marker genes. 
--- -## Installation +# Installation + +Currently, the package can be installed directly from GitHub: ```{r install} -# Install the development version from GitHub # install.packages("devtools") devtools::install_github("ajxa/CellVoteR") ``` @@ -66,38 +68,59 @@ library(CellVoteR) --- -## Step 1 — Preparing Marker Inputs +# Detailed Pipeline Steps + +## Step 1: Preparing Marker Inputs -Markers are the backbone of CellVoteR's annotation strategy. They are -organised into two tiers: +Markers are the backbone of CellVoteR's annotation strategy. They are organised into two tiers: -### Broad markers +### Broad markers (lineage-specific) Broad markers define coarse cell lineages (e.g. *Immune*, *Vasculature*, -*Other*). They must be: +*Other*). Therefore, they must be: + +- **Small** — typically 2–5 genes per category. + +- **Mutually exclusive** — no gene should appear in more than one broad category. -- **Small** — typically 2–5 genes per category -- **Mutually exclusive** — no gene should appear in more than one broad - category -- **Biologically diagnostic** — genes that robustly delineate lineages even - in heterogeneous datasets +- **Biologically diagnostic** — genes that robustly delineate lineages even in heterogeneous datasets. -Broad markers are loaded and then configured with `build_broad_marker_config()`, +These broad markers are loaded and then configured with `build_broad_marker_config()`, which assigns expression thresholds and priority rankings used for tie-breaking when a cell passes multiple broad categories. -### Fine markers +### Fine markers (cell-type specific) Fine markers define sub-populations within each broad lineage (e.g. *B cell*, *T cell*, *NK cell* within *Immune*). They can be larger gene sets and are -used for Fisher's Exact Test scoring during fine annotation. They do not need -to be mutually exclusive. +used for Fisher's Exact Test scoring during fine annotation. 
These markers do not need +to be mutually exclusive, but should sufficiently distinguish between two cell types from a common +lineage, e.g., T cells vs B cells and Mural cells vs Endothelial cells. + ### Loading markers -Markers are loaded from an Excel workbook (`.xlsx`) or a nested YAML file -using `load_markers()`. The expected structure has two sheets or top-level -keys: `broad` and `fine`. +User-supplied markers can be loaded from `Excel`, `CSV`, or `TXT` files. The files must be structured to comprise four columns: `type` (broad/fine), `category`, `label`, and `marker`: + + +| type | category | label | marker | +|---|---|---|---| +| broad | immune | | PTPRC | +| broad | vasculature | | CDH5 | +| broad | vasculature | | VWF | +| fine | immune | T cell | CD2 | +| fine | immune | T cell | CD3D | +| fine | immune | T cell | IL32 | +| fine | immune | B cell | CD79A | +| fine | immune | B cell | CD79B | +| fine | vasculature | Mural cell | IGFBP7 | +| fine | vasculature | Mural cell | FN1 | +| fine | vasculature | Endothelial | A2M | +| fine | vasculature | Endothelial | IGFBP7 | + + +> When defining broad category markers, leave the **label** field blank. For fine cell type markers within that broad category, assign a **label**. + ```{r load-markers} markers <- load_markers(file_path = "path/to/input_markers.xlsx") @@ -124,19 +147,17 @@ markers$broad <- build_broad_marker_config( str(markers$broad$immune) ``` -The `priority_order` argument controls tie-breaking when a cell passes -expression thresholds for more than one broad category — categories listed -earlier receive a lower (higher priority) numeric rank. +> The **priority_order** argument controls tie-breaking when a cell passes expression thresholds for more than one broad category - categories listed earlier receive a lower (higher priority) numeric rank. 
--- -## Step 2 — Creating the SingleCellExperiment +## Step 2: Object Creation CellVoteR works natively with -[`SingleCellExperiment`](https://bioconductor.org/packages/SingleCellExperiment/) +[*SingleCellExperiment*](https://bioconductor.org/packages/SingleCellExperiment/) objects. Use `create_sce()` to construct one from your raw data. -### From an in-memory sparse matrix +### From (in-memory) sparse matrix ```{r create-sce-memory} sce <- create_sce( @@ -145,7 +166,7 @@ sce <- create_sce( ) ``` -### From file paths +### From file path `create_sce()` also accepts file paths, which is useful for large datasets where the matrix is stored on disk: @@ -167,7 +188,7 @@ sce <- create_sce( --- -## Step 3 — Quality Control +## Step 3: Quality Control `assess_cell_quality()` calculates per-cell QC metrics and optionally removes low-quality cells before downstream analysis. @@ -176,12 +197,11 @@ low-quality cells before downstream analysis. sce <- assess_cell_quality(sce, remove_failed_cells = TRUE) ``` -Cells that fail QC are flagged in `colData(sce)$QC_PASS`. Setting -`remove_failed_cells = TRUE` subsets the object to passing cells only. +> Cells that fail QC are flagged in **colData(sce)$QC_PASS**. Setting **remove_failed_cells = TRUE** subsets the object to passing cells only. --- -## Step 4 — Normalisation +## Step 4: Normalisation CellVoteR uses a pooling-based normalisation strategy ([Lun et al. 2016](https://doi.org/10.1186/s13059-016-0947-7)) via @@ -192,40 +212,42 @@ CellVoteR uses a pooling-based normalisation strategy sce <- normalize_counts(sce) ``` -This adds a `logcounts` assay to the SCE and sets `sizeFactors()`. The -`logcounts` assay is required by all downstream steps. +> This adds a *logcounts* assay to the SCE and sets `sizeFactors()`. The *logcounts* assay is required by all downstream steps. --- -## Step 5 — Building Analysis Tracks +## Step 5: Building Analysis Tracks + +The `prepare_sce()` function co-ordinates the key preprocessing step. 
This function performs +the following steps: + +1. Validates broad and fine marker configurations against the expression matrix. -`prepare_sce()` is the key preprocessing step. It: +2. Builds two independent feature spaces — the **full HVG space** and the **reduced marker-defined space**. + +3. Runs PCA and unsupervised clustering (Leiden via SNN graph) on each space. -1. Validates broad and fine marker configurations against the expression matrix -2. Builds two independent feature spaces — the **full HVG space** and the - **reduced marker-defined space** -3. Runs PCA and unsupervised clustering (Leiden via SNN graph) on each space -4. Attaches the marker configuration and filtered fine markers to the SCE - metadata -5. Stores the reduced feature space as an `altExp` named `"user_panel"` +4. Attaches the marker configuration and filtered fine markers to the SCE metadata. + +5. Stores the reduced feature space as an **altExp** named *"user_panel"*. ```{r prepare} sce <- prepare_sce(sce, markers) ``` -After this step, the SCE has the following structure: +After this step, the initial SingleCellExperiment object structure is as follows: ``` sce ├── assays: counts, logcounts -├── rowSubset("broad_hvgs") ← HVGs used for broad clustering -├── reducedDim("PCA_broad_hvg") ← PCA on full HVG space -├── colData$cluster_broad_hvg ← Leiden clusters (full space) -├── colData$cluster_broad_reduced ← Leiden clusters (reduced space) -├── metadata$marker_config ← full marker configuration -├── metadata$filtered_fine_markers ← fine markers present in data -├── metadata$missing_by_label ← per-label missing marker report -└── altExp("user_panel") ← reduced feature SCE +├── rowSubset("broad_hvgs") ← HVGs used for broad clustering +├── reducedDim("PCA_broad_hvg") ← PCA on full HVG space +├── colData$cluster_broad_hvg ← Leiden clusters (full space) +├── colData$cluster_broad_reduced ← Leiden clusters (reduced space) +├── metadata$marker_config ← full marker 
configuration +├── metadata$filtered_fine_markers ← fine markers present in data +├── metadata$missing_by_label ← per-label missing marker report +└── altExp("user_panel") ← reduced feature SCE ├── assays: counts, logcounts ├── reducedDim("PCA") ├── colData$cluster @@ -238,8 +260,7 @@ sce Clustering parameters (number of PCs, SNN neighbourhood size $k$, Leiden resolution) are estimated automatically from cell count using bounded -log/sqrt scaling via `estimate_cluster_params()`. You can override these if -needed: +log/sqrt scaling via `estimate_cluster_params()`. However, these parameters can be overridden if required: ```{r prepare-override} sce <- prepare_sce( @@ -254,29 +275,37 @@ sce <- prepare_sce( ### Marker overlap reporting -If fine markers are partially missing from your dataset, `prepare_sce()` -reports which labels are most affected: +In some cases, you may have fine markers that are partially missing from your dataset: the `prepare_sce()` function captures this information and details the labels that are most affected, which can be inspected as follows: ```{r missing-markers} # Inspect missing marker report after prepare_sce() metadata(sce)$missing_by_label ``` -Missing markers are automatically removed from the fine marker sets used for -scoring, so annotation still proceeds with the genes that are present. +Markers missing from the data are automatically removed from the fine marker sets used for scoring, so annotation still proceeds with the genes that are present. 
This only occurs when the total number of missing markers is <50% of the total distinct markers supplied - this parameter can be relaxed or tightened accordingly, by altering the `overlap_feat_percent` argument: + +```{r override-marker-overlap} + +prepare_sce( + sce, + markers, + overlap_feat_percent = 75 # more stringent +) +``` + --- -## Step 6 — Running the Ensemble +## Step 6: Annotation Methods -`run_cellvoter()` orchestrates all six annotation pipelines and returns a -named list of per-cell label factors, one per method. +The `run_cellvoter()` function orchestrates all of the individual annotation pipelines and returns a +named list of per-cell label factors, one per method: ```{r run} results <- run_cellvoter(sce) ``` -### What happens internally +### What this does internally Four primary methods and two global tie-breakers are run: @@ -296,20 +325,20 @@ broad annotation ↓ subcluster_labels() ↓ -rank_cluster_markers() ← DE testing per sub-cluster +rank_cluster_markers() ← DE testing per sub-cluster ↓ -extract_top_markers() ← select top N genes per cluster +extract_top_markers() ← select top N genes per cluster ↓ -score_markers_against_panel() ← Fisher's Exact Test + overlap similarity +score_markers_against_panel() ← Fisher's Exact Test + overlap similarity ↓ -assign_fine_labels() ← best label per cluster → mapped to cells +assign_fine_labels() ← best label per cluster → mapped to cells ``` ### Broad annotation strategies **Cluster-based** (`annotate_broad_clusters`): Runs DE testing on pre-existing unsupervised clusters. Each cluster is assigned the broad category whose -curated markers have the lowest median rank among significantly upregulated +curated markers have the lowest median rank among significantly up-regulated genes (FDR ≤ 0.05, AUC ≥ 0.6 by default). 
**Enrichment-based** (`annotate_broad_cells`): Assigns labels directly to @@ -317,36 +346,51 @@ individual cells by aggregating expression across each broad category's marker genes and comparing against category-specific thresholds. Does not depend on clustering. -### Edge case — collapsed broad labels - -In some datasets (e.g. a highly homogeneous tumour sample), all clusters may +> In some datasets (e.g. a highly homogeneous tumour sample), all clusters may receive the same broad label. CellVoteR detects this and retains the original cluster structure for subclustering rather than collapsing to a single group, -preserving fine-resolution information downstream. +preserving the fine-resolution information for downstream processes. -### Returned colData columns +### Annotation Method Results -After `run_cellvoter()`, the returned `$sce` has all intermediate cluster -labels attached: +After running `run_cellvoter()`, the returned SingleCellExperiment object is populated with all of the intermediate cluster labels, which can be accessed by querying the `colData()` columns of the object: | Column | Description | |---|---| -| `cluster_broad_hvg` | Pre-existing HVG clusters from `prepare_sce()` | -| `cluster_broad_reduced` | Pre-existing reduced clusters from `prepare_sce()` | -| `broad_cluster_m1` | Broad labels, method 1 | -| `broad_cluster_sub_m1` | Sub-cluster labels, method 1 | -| `broad_cluster_m2` | Broad labels, method 2 | -| `broad_cluster_sub_m2` | Sub-cluster labels, method 2 | -| `broad_enrichment_m3` | Broad labels, method 3 | -| `broad_enrichment_sub_m3` | Sub-cluster labels, method 3 | -| `broad_enrichment_m4` | Broad labels, method 4 | -| `broad_enrichment_sub_m4` | Sub-cluster labels, method 4 | - -### Customising annotation parameters - -`run_cellvoter()` accepts an `annotation_args` list to customise the -underlying functions. 
There are two independent ranking steps, each with its -own sublist: +| **cluster_broad_hvg** | Pre-existing HVG clusters from `prepare_sce()` | +| **cluster_broad_reduced** | Pre-existing reduced clusters from `prepare_sce()` | +| **broad_cluster_m1** | Broad labels, method 1 | +| **broad_cluster_sub_m1** | Sub-cluster labels, method 1 | +| **broad_cluster_m2** | Broad labels, method 2 | +| **broad_cluster_sub_m2** | Sub-cluster labels, method 2 | +| **broad_enrichment_m3** | Broad labels, method 3 | +| **broad_enrichment_sub_m3** | Sub-cluster labels, method 3 | +| **broad_enrichment_m4** | Broad labels, method 4 | +| **broad_enrichment_sub_m4** | Sub-cluster labels, method 4 | + +### Customising parameters + +The main `run_cellvoter()` function accepts an `annotation_args` list which can be used to customise various parameters of the underlying internal functions, if required. The following parameter lists can be specified, each of which controls a specific aspect of the underlying logic: + + +- **rank_args** Controls parameters associated with `rank_cluster_markers()`: + - assay_type + - test_type + - direction + - pval_type + - min_prop + - BPPARAM + +- **broad_args** is identical to the `rank_args`, but only controls the ranking inside the +`annotate_broad_clusters` function. This is useful for altering the parameters which control how the +broad cell lineage labels (e.g., immune, vasculature) are defined - these broad markers are small and highly specific and so you may wish to use a more lenient `min_prop` compared to the fine labels and this allows for such fine control of the process. 
+ +- **extract_args** Controls parameters associated with `extract_top_markers()`: + - fdr_threshold + - effect_threshold + - target_n + +The following is an example of how this can be used to independently alter the broad and fine labelling logic: ```{r run-custom} results <- run_cellvoter( @@ -378,17 +422,12 @@ results <- run_cellvoter( ) ``` -> **Note on `broad_args` vs `rank_args`:** Both control -> `rank_cluster_markers()` but at different pipeline stages. -> `broad_args` affects how broad lineage labels are assigned (methods 1 and 2 -> only); `rank_args` affects how sub-cluster marker genes are extracted for -> Fisher scoring (all six methods). Methods 3 and 4 use `rank_args` only, -> as the enrichment-based broad step does not call `rank_cluster_markers()`. +> **broad_args** vs **rank_args**: these both control `rank_cluster_markers()` but at different pipeline stages. *broad_args* affects how broad lineage labels are assigned (methods 1 and 2 only); *rank_args* affects how sub-cluster marker genes are extracted for Fisher scoring (all six methods). Methods 3 and 4 use *rank_args* only, as the enrichment-based broad step does not call *rank_cluster_markers()*. ### Accessing full outputs -When `return_full_output = TRUE`, per-cluster Fisher scores and similarity -values are available for every method: +> Setting `return_full_output = TRUE` returns the per-cluster Fisher scores and similarity +values for every method (by default this is set to `FALSE`). ```{r full-output} # Per-cluster score table for method 1 @@ -400,7 +439,7 @@ results$full_output$global_2$scores --- -## Step 7 — Resolving Consensus Labels +## Step 7: Resolving Consensus Labels Consensus resolution is intentionally a separate step. 
This means you can adjust voting parameters and re-run `resolve_consensus_labels()` as many @@ -443,9 +482,9 @@ For each cell, the following logic is applied in order: | Parameter | Default | Effect | |---|---|---| -| `allow_even_split` | `FALSE` | When `TRUE`, a 2-of-4 plurality is accepted as majority | -| `ordered_tiebreak` | `TRUE` | When `FALSE`, either tie-breaker can resolve a split | -| `unassigned_label` | `"Unknown"` | Label for unresolved cells | +| **allow_even_split** | **FALSE** | When **TRUE**, a 2-of-4 plurality is accepted as majority | +| **ordered_tiebreak** | **TRUE** | When **FALSE**, either tie-breaker can resolve a split | +| **unassigned_label** | **"Unknown"** | Label for unresolved cells | ### Attaching labels to the SCE @@ -456,7 +495,7 @@ sce$cellVoteR_method <- consensus$method --- -## Step 8 — Inspecting Results +## Step 8: Inspecting Results ### Summary tables @@ -503,7 +542,7 @@ table(label_df$n_agree) ### Re-running consensus with different parameters ```{r re-resolve} -# Try allowing even splits +# allowing even splits consensus_liberal <- resolve_consensus_labels( label_list = results$labels, method_names = results$method_names, @@ -521,7 +560,7 @@ mean(consensus_liberal$label == "unknown") --- -## Complete Workflow +# Complete Workflow The full pipeline from raw data to annotated SCE: @@ -574,61 +613,9 @@ table(sce$cellVoteR_method) --- -## Marker File Format - -### Excel workbook - -The expected workbook has two sheets: - -**Sheet: `broad`** - -| category | gene | -|---|---| -| immune | CD45 | -| immune | PTPRC | -| vasculature | PECAM1 | -| vasculature | CDH5 | - -**Sheet: `fine`** - -| broad_category | cell_type | gene | -|---|---|---| -| immune | B cell | CD79A | -| immune | B cell | CD79B | -| immune | T cell | CD3D | -| immune | T cell | IL32 | -| vasculature | Endothelial | CLDN5 | - -### YAML format - -```yaml -broad: - immune: - - CD45 - - PTPRC - vasculature: - - PECAM1 - - CDH5 - -fine: - immune: - B cell: - - CD79A - 
- CD79B - T cell: - - CD3D - - IL32 - vasculature: - Endothelial: - - CLDN5 - - SPARC -``` - ---- - -## Tips and Troubleshooting +# Tips and Troubleshooting -### Low marker overlap +## Low marker overlap If `prepare_sce()` warns about low fine marker overlap, inspect which labels are most affected: @@ -661,7 +648,7 @@ consensus <- resolve_consensus_labels( table(results$labels$method_1, results$labels$method_2) ``` -### Collapsed broad labels +## Collapsed broad labels If all clusters receive the same broad label, CellVoteR retains the original cluster structure automatically. This is expected behaviour for highly @@ -670,7 +657,7 @@ In this case, the numeric cluster prefixes (e.g. `1_sc1`, `2_sc1`) trigger testing against the full fine marker panel rather than a lineage-specific subset. -### Large datasets +## Large datasets For datasets exceeding available RAM, convert the SCE to HDF5-backed storage after `create_sce()`: @@ -680,7 +667,7 @@ HDF5Array::saveHDF5SummarizedExperiment(sce, dir = "my_hdf5_sce") sce <- HDF5Array::loadHDF5SummarizedExperiment("my_hdf5_sce") ``` -### Parallelisation +## Parallelisation Key functions accept a `BPPARAM` argument for parallelisation via [`BiocParallel`](https://bioconductor.org/packages/BiocParallel/): @@ -699,8 +686,8 @@ results <- run_cellvoter( --- -## Session Info +# Session Info -```{r session} -sessionInfo() +```{r session, echo=FALSE, eval=TRUE} +print(sessionInfo()) ```