nf-core · Ositofeliz · Apr 7, 2026 · Mar 19, 2026 · Mar 19, 2026 · Mar 20, 2026
diff --git a/.github/actions/nf-test/action.yml b/.github/actions/nf-test/action.yml
@@ -68,7 +68,7 @@ runs:
           --changed-since HEAD^ \
           --verbose \
           --tap=test.tap \
-          --shard ${{ inputs.shard }}/${{ inputs.total_shards }}
+          --shard ${{ inputs.shard }}/${{ inputs.total_shards }} --debug
 
           # Save the absolute path of the test.tap file to the output
           echo "tap_file_path=$(realpath test.tap)" >> $GITHUB_OUTPUT

diff --git a/.prettierignore b/.prettierignore
@@ -16,3 +16,4 @@ modules/nf-core/
 subworkflows/nf-core/
 galaxy/
 docs/
+tests/act
diff --git a/README.md b/README.md
@@ -50,15 +50,15 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 - Get NCBI [GEO](https://www.ncbi.nlm.nih.gov/gds) **microarray** dataset accessions corresponding to the provided species (and optionally keywords)
   This is optional and **NOT** run by default. Set `--fetch_geo_accessions` to run it.
 
-#### 2. Download data (see [usage](conf/usage.md#3-provide-your-own-accessions))
+#### 2. Download data (see [usage](./conf/usage.md#3-provide-your-own-accessions))
 
 - Download [Expression Atlas](https://www.ebi.ac.uk/gxa/home) data if any
 - Download NCBI [GEO](https://www.ncbi.nlm.nih.gov/gds) data if any
 
 > [!NOTE]
-> At this point, datasets downloaded from public databases are merged with datasets provided by the user using the `--datasets` parameter. See [usage](conf/usage.md#4-use-your-own-expression-datasets) for more information about local datasets.
+> At this point, datasets downloaded from public databases are merged with datasets provided by the user using the `--datasets` parameter. See [usage](./conf/usage.md#4-use-your-own-expression-datasets) for more information about local datasets.
 
-#### 3. ID Mapping (see [usage](conf/usage.md#5-custom-gene-id-mapping--metadata))
+#### 3. ID Mapping (see [usage](./conf/usage.md#5-custom-gene-id-mapping--metadata))
 
 - Gene IDs are cleaned
 - Map gene IDs to NCBI Entrez Gene IDs (or Ensembl IDs) for standardisation among datasets using [g:Profiler](https://biit.cs.ut.ee/gprofiler/gost) (run by default; optional)
@@ -99,6 +99,14 @@ Base statistics are computed for each gene, platform-wide and for each platform
 - Make [`MultiQC`](http://multiqc.info/) report
 - Prepare [Dash Plotly](https://dash.plotly.com/) app for further investigation of gene / sample counts
 
+## Test pipeline
+
+You can test the execution of the pipeline locally with:
+
+```bash
+nextflow run nf-core/stableexpression -profile test,<docker/apptainer/conda/micromamba/...>
+```
+
 ## Basic usage
 
 > [!NOTE]
@@ -125,7 +133,7 @@ please refer to the [usage documentation](https://nf-co.re/stableexpression/usag
 
 ## Resource allocation
 
-For setting pipeline CPU / memory usage, see [here](docs/configuration.md).
+For setting pipeline CPU / memory usage, see [here](./docs/configuration.md).
 
 ## Profiles
 
@@ -150,6 +158,8 @@ nf-core/stableexpression was originally written by Olivier Coen.
 We thank the following people for their assistance in the development of this pipeline:
 
 - Rémy Costa
+- Shaheen Acheche
+- Janine Soares
 
 ## Contributions and Support
 

diff --git a/...stom_content_multiqc_config.template.yaml → ...ltiqc_config.custom_content.template.yaml b/...stom_content_multiqc_config.template.yaml → ...ltiqc_config.custom_content.template.yaml
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
@@ -16,7 +16,7 @@ run_modules:
 
 disable_version_detection: true
 
-max_table_rows: 5000
+max_table_rows: 100000
 
 table_cond_formatting_colours:
   - first: "#ffd700"

diff --git a/bin/aggregate_results.py b/bin/aggregate_results.py
@@ -290,8 +290,10 @@ def search_target_genes(df: pl.DataFrame, target_genes: list[str]) -> list[dict]
         )
         unique_gene_ids |= set(original_gene_ids)
 
+    # putting all unique gene IDs, gene names and original gene IDs into a single list
     all_unique_gene_ids = [gene for gene in unique_gene_ids if gene is not None]
 
+    # formatting all gene IDs found
     formated_gene_ids_df = pl.DataFrame({"gene": all_unique_gene_ids}).with_columns(
         pl.col("gene")
         .map_batches(
@@ -301,6 +303,7 @@ def search_target_genes(df: pl.DataFrame, target_genes: list[str]) -> list[dict]
         .alias("formatted_gene")
     )
 
+    # formatting target genes
     formated_target_genes_df = pl.DataFrame({"target_gene": target_genes}).with_columns(
         pl.col("target_gene")
         .map_batches(
@@ -315,6 +318,7 @@ def search_target_genes(df: pl.DataFrame, target_genes: list[str]) -> list[dict]
             formated_target_genes_df, on="formatted_gene", how="inner"
         )
         .select(["target_gene", "gene"])
+        .sort("target_gene")
         .to_dicts()
     )
 

diff --git a/bin/compute_dataset_statistics.py b/bin/compute_dataset_statistics.py
@@ -14,6 +14,7 @@
 logger = logging.getLogger(__name__)
 
 KEY_TO_OUTFILE = {"skewness": "skewness.txt"}
+FLOAT_PRECISION = 6
 
 
 #####################################################
@@ -39,6 +40,10 @@ def compute_dataset_statistics(df: pl.DataFrame) -> dict:
     return dict(skewness=list(skewness))
 
 
+def format_value(value: float) -> str:
+    """
+    Format one statistic value for export.
+
+    Non-zero numbers are written in fixed-point notation with FLOAT_PRECISION
+    decimals; zero is written as the bare string "0".
+    A missing value (None) cannot go through a float format spec (it raises
+    TypeError), so it is written with str(), as the exporter did before
+    fixed-precision formatting was introduced.
+    """
+    if value is None:
+        # NOTE(review): whether the statistics can contain None is not visible here - confirm
+        return str(value)
+    return f"{value:.{FLOAT_PRECISION}f}" if value != 0 else "0"
+
+
 def export_count_data(stats: dict):
     """
     Export dataset statistics to CSV files.
@@ -47,7 +52,7 @@ def export_count_data(stats: dict):
     for key, outfile_name in KEY_TO_OUTFILE.items():
         logger.info(f"Exporting dataset statistics {key} to: {outfile_name}")
         with open(outfile_name, "w") as outfile:
-            outfile.write(",".join([str(val) for val in stats[key]]))
+            outfile.write(",".join([format_value(val) for val in stats[key]]))
 
 
 #####################################################

diff --git a/bin/compute_gene_statistics.py b/bin/compute_gene_statistics.py
@@ -130,9 +130,11 @@ def compute_ratios_null_values(
     # the samples showing a low gene count will not be taken into account for the zero count penalty
     nb_nulls = df.select(pl.exclude(config.GENE_ID_COLNAME).is_null()).sum_horizontal()
 
-    if valid_samples:
+    found_valid_samples = [sample for sample in valid_samples if sample in df.columns]
+
+    if found_valid_samples:
         nb_nulls_valid_samples = df.select(
-            pl.col(valid_samples).is_null()
+            pl.col(found_valid_samples).is_null()
         ).sum_horizontal()
     else:
         nb_nulls_valid_samples = nb_nulls
@@ -143,7 +145,7 @@ def compute_ratios_null_values(
         (nb_nulls / nb_samples).alias(
             get_colname(config.RATIO_NULLS_COLNAME, platform)
         ),
-        (nb_nulls_valid_samples / len(valid_samples)).alias(
+        (nb_nulls_valid_samples / len(found_valid_samples)).alias(
             get_colname(config.RATIO_NULLS_VALID_SAMPLES_COLNAME, platform)
         ),
     )

diff --git a/bin/detect_rare_genes.py b/bin/detect_rare_genes.py
@@ -106,10 +106,13 @@ def main():
         .unique()
     )
 
+    # sorting (for output consistency)
+    df = df.sort(["total_occurrences_quantile", "gene_id"], descending=[True, False])
+
     # writing total occurrences in a csv before filtering
-    df.select([config.GENE_ID_COLNAME, "total_occurrences_quantile"]).sort(
-        "total_occurrences_quantile", descending=True
-    ).write_csv(TOTAL_OCCURRENCES_OUTFILE)
+    df.select([config.GENE_ID_COLNAME, "total_occurrences_quantile"]).write_csv(
+        TOTAL_OCCURRENCES_OUTFILE
+    )
 
     # filtering genes
     valid_gene_ids = (

diff --git a/bin/download_eatlas_data.R b/bin/download_eatlas_data.R
@@ -66,6 +66,10 @@ download_expression_atlas_data_with_retries <- function(accession, max_retries =
                     warning(w$message)
                     write("EXPERIMENT NOT FOUND", file = FAILURE_REASON_FILE)
                     quit(save = "no", status = 0)
+                } else if (grepl("FTP status was", w$message)) {
+                    warning(w$message)
+                    write("FTP ERROR", file = FAILURE_REASON_FILE)
+                    quit(save = "no", status = 101)
                 } else {
                     warning("Unhandled warning: ", w$message)
                     write("UNKNOWN ERROR", file = FAILURE_REASON_FILE)

diff --git a/bin/download_geo_data.R b/bin/download_geo_data.R
@@ -379,6 +379,26 @@ get_microarray_counts <- function(platform) {
   return(counts)
 }
 
+# Read only the first line of `filename`, splitting fields on `sep` and using
+# the first field as the row name.
+# Returns the parsed one-row data frame, or NULL (after recording a warning)
+# when read.table raises an error.
+parse_first_line <- function(filename, sep){
+    on_error <- function(e) {
+        write_warning(paste("ERROR PARSING FIRST LINE IN", filename))
+        NULL
+    }
+    tryCatch(
+        read.table(filename, header = FALSE, sep = sep, row.names = 1, nrows = 1),
+        error = on_error
+    )
+}
+
+# Download `data_url` to `filename` using wget.
+# Returns "SUCCESS" when the download completed, and "FAILURE" (after recording
+# a warning) when download.file raised an error or returned a non-zero status.
+download_file <- function(data_url, filename){
+    status <- tryCatch({
+        # download.file returns (invisibly) an integer code: 0 for success, non-zero for failure
+        download.file(data_url, filename, method = "wget", quiet = TRUE)
+    }, error = function(e) {
+        # map an R error onto a non-zero status so both failure paths are handled below
+        1L
+    })
+    if (!identical(as.integer(status), 0L)) {
+        write_warning(paste("ERROR WHILE DOWNLOADING:", filename))
+        return("FAILURE")
+    }
+    return("SUCCESS")
+}
+
 
 get_raw_counts_from_url <- function(data_url) {
 
@@ -399,20 +419,23 @@ get_raw_counts_from_url <- function(data_url) {
     }
 
     message(paste("Downloading", filename))
-    tryCatch({
-        download.file(data_url, filename, method = "wget", quiet = TRUE)
-    }, error = function(e) {
-        write_warning(paste("ERROR WHILE DOWNLOADING:", filename))
-        return(NULL)
-    })
+    download_status <- download_file(data_url, filename)
+    if (download_status == "FAILURE") {
+      return(NULL)
+    }
 
     separator <- NULL
     for (sep in c("\t", ",", " ")) {
+
         # parsing the first line to determine the separator and see if there is a header
-        counts <- read.table(filename, header = FALSE, sep = sep, row.names = 1, nrows = 1)
-        if (ncol(counts) > 0) {
+        first_line <- parse_first_line(filename, sep)
+        if (is.null(first_line)) {
+          return(NULL)
+        }
+
+        if (ncol(first_line) > 0) {
             separator <- sep
-            if (is.numeric(counts[1, 1])) {
+            if (is.numeric(first_line[1, 1])) {
                 has_header <- FALSE
             } else {
                 has_header <- TRUE
@@ -430,7 +453,7 @@ get_raw_counts_from_url <- function(data_url) {
     tryCatch({
       counts <- read.table(filename, header = has_header, sep = separator, row.names = 1)
     }, error = function(e) {
-        write_warning(paste("ERROR WHILE PARSING", filename, ":", e))
+        write_warning(paste("ERROR WHILE PARSING", filename))
         return(NULL)
     })
 
@@ -793,6 +816,8 @@ main <- function() {
           write_warning(paste("UNSUPPORTED PLATFORM:", series$experiment_type))
         }
     }
+
+    message("Done")
 }
 
 

diff --git a/bin/get_eatlas_accessions.py b/bin/get_eatlas_accessions.py
@@ -446,6 +446,7 @@ def main():
     # getting accessions of selected experiments
     selected_accessions = [exp_dict["accession"] for exp_dict in results]
 
+    sampling_status = "ok"
     if args.random_sampling_size and args.random_sampling_seed:
         selected_accession_to_nb_samples = [
             {
@@ -469,11 +470,13 @@ def main():
             f"Kept {len(selected_accessions)} experiments after random sampling"
         )
 
-        # writing status to file
-        # so that the wrapper module can get the status
-        with open(SAMPLING_QUOTA_OUTFILE, "w") as fout:
-            sampling_status = "full" if sampling_quota_reached else "ok"
-            fout.write(sampling_status)
+        if sampling_quota_reached:
+            sampling_status = "full"
+
+    # writing status to file
+    # so that the wrapper module can get the status
+    with open(SAMPLING_QUOTA_OUTFILE, "w") as fout:
+        fout.write(sampling_status)
 
     # keeping metadata only for selected experiments
     selected_experiments = get_metadata_for_selected_experiments(experiments, results)

diff --git a/bin/merge_counts.py b/bin/merge_counts.py
@@ -11,7 +11,6 @@
 
 import config
 import polars as pl
-from tqdm import tqdm
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -41,7 +40,7 @@ def parse_args():
 
 def get_lazyframes(files: list[Path]) -> list[pl.LazyFrame]:
     """Get a list of LazyFrames from a list of files."""
-    return [pl.scan_parquet(file, low_memory=True) for file in tqdm(files)]
+    return [pl.scan_parquet(file, low_memory=True) for file in files]
 
 
 def get_columns(lf: pl.LazyFrame) -> list[str]:
@@ -99,7 +98,7 @@ def collect_all_gene_ids(lfs: list[pl.LazyFrame]) -> pl.DataFrame:
     """
     logger.info("Getting the full list of gene IDs")
     gene_id_set = set()
-    for lf in tqdm(lfs):
+    for lf in lfs:
         lf_gene_ids = lf.select(config.GENE_ID_COLNAME).collect().to_series().to_list()
         gene_id_set.update(lf_gene_ids)
     return pl.DataFrame({config.GENE_ID_COLNAME: sorted(list(gene_id_set))})
-Original file line number
+Diff line change
@@ Expand Up / @@ -16,3 +16,4 @@ modules/nf-core/ @@
     subworkflows/nf-core/
     galaxy/
     docs/
+    tests/act