diff --git a/.github/actions/nf-test/action.yml b/.github/actions/nf-test/action.yml index 3b9724c..77778a0 100644 --- a/.github/actions/nf-test/action.yml +++ b/.github/actions/nf-test/action.yml @@ -56,6 +56,12 @@ runs: channel-priority: strict conda-remove-defaults: true + - name: Configure Nextflow secrets + shell: bash + run: | + nextflow secrets set ENA_WEBIN "$WEBIN_ACCOUNT" + nextflow secrets set ENA_WEBIN_PASSWORD "$WEBIN_PASSWORD" + - name: Run nf-test shell: bash env: diff --git a/CITATIONS.md b/CITATIONS.md index 8934342..495f422 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -14,6 +14,40 @@ > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. +- [CoverM](https://github.com/wwood/CoverM) + +> Aroney ST, Newell RJ, Nissen JN, Camargo AP, Tyson GW, Woodcroft BJ. CoverM: Read alignment statistics for metagenomics. Bioinformatics. 2025;41(4):btaf147. doi: 10.1093/bioinformatics/btaf147. PubMed PMID: 40193404; PubMed Central PMCID: PMC11993303. + +- [CheckM2](https://github.com/chklovski/CheckM2) + +> Chklovski A, Parks DH, Woodcroft BJ, Tyson GW. CheckM2: a rapid, scalable and accurate tool for assessing microbial genome quality using machine learning. Nat Methods. 2023;20(8):1203-1212. doi: 10.1038/s41592-023-01940-w. PubMed PMID: 37500759; PubMed Central PMCID: not available. + +- [CAT and BAT](https://doi.org/10.1186/s13059-019-1817-x) + +> von Meijenfeldt FAB, Arkhipova K, Cambuy DD, Coutinho FH, Dutilh BE. Robust taxonomic classification of uncharted microbial sequences and bins with CAT and BAT. Genome Biol. 2019;20(1):217. doi: 10.1186/s13059-019-1817-x. PubMed PMID: 31640809; PubMed Central PMCID: PMC6805573. + +- [tRNAscan-SE 2.0](https://doi.org/10.1093/nar/gkab688) + +> Chan PP, Lin BY, Mak AJ, Lowe TM. 
tRNAscan-SE 2.0: Improved detection and functional classification of transfer RNA genes. Nucleic Acids Res. 2021;49(16):9077-9096. doi: 10.1093/nar/gkab688. PubMed PMID: 34417604; PubMed Central PMCID: PMC8450103. + +- [barrnap](https://github.com/tseemann/barrnap) + + > Seemann T. Barrnap: rapid ribosomal RNA prediction. GitHub repository. https://github.com/tseemann/barrnap + +## Submission and helper tools + +- [ENA Webin-CLI](https://github.com/enasequence/webin-cli) + + > European Nucleotide Archive. Webin command line submission interface (Webin-CLI). GitHub repository. https://github.com/enasequence/webin-cli + +- [assembly_uploader](https://github.com/EBI-Metagenomics/assembly_uploader) + + > EBI Metagenomics. ENA Metagenome Assembly uploader. GitHub repository. https://github.com/EBI-Metagenomics/assembly_uploader + +- [genome_uploader](https://github.com/EBI-Metagenomics/genome_uploader) + + > EBI Metagenomics. ENA public Bins and MAGs uploader. GitHub repository. https://github.com/EBI-Metagenomics/genome_uploader + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/README.md b/README.md index fb8c2cf..0a65eb9 100644 --- a/README.md +++ b/README.md @@ -38,9 +38,9 @@ Currently, the pipeline supports three submission modes, each routed to a dedica Setup your environment secrets before running the pipeline: -`nextflow secrets set WEBIN_ACCOUNT "Webin-XXX"` +`nextflow secrets set ENA_WEBIN "Webin-XXX"` -`nextflow secrets set WEBIN_PASSWORD "XXX"` +`nextflow secrets set ENA_WEBIN_PASSWORD "XXX"` Make sure you update commands above with your authorised credentials. @@ -55,35 +55,44 @@ The input must follow `assets/schema_input_genome.json`. 
Required columns: - `sample` -- `fasta` (must end with `.fa.gz` or `.fasta.gz`) +- `fasta` (must end with `.fa.gz`, `.fasta.gz`, or `.fna.gz`) - `accession` - `assembly_software` - `binning_software` - `binning_parameters` -- `stats_generation_software` - `metagenome` - `environmental_medium` - `broad_environment` - `local_environment` - `co-assembly` -Columns that required for now, but will be optional in the nearest future: +At least one of the following must be provided per row: +- reads (`fastq_1`, optional `fastq_2` for paired-end) +- `genome_coverage` + +Additional supported columns: + +- `stats_generation_software` - `completeness` - `contamination` -- `genome_coverage` - `RNA_presence` - `NCBI_lineage` -Those fields are metadata required for [genome_uploader](https://github.com/EBI-Metagenomics/genome_uploader) package. +If `genome_coverage`, `stats_generation_software`, `completeness`, `contamination`, `RNA_presence`, or `NCBI_lineage` are missing, the workflow can calculate or infer them when the required inputs are available. + +Those fields are metadata required for the [genome_uploader](https://github.com/EBI-Metagenomics/genome_uploader) package. 
-Example `samplesheet_genome.csv`: +Example `samplesheet_genomes.csv`: ```csv -sample,fasta,accession,assembly_software,binning_software,binning_parameters,stats_generation_software,completeness,contamination,genome_coverage,metagenome,co-assembly,broad_environment,local_environment,environmental_medium,RNA_presence,NCBI_lineage -lachnospira_eligens,data/bin_lachnospira_eligens.fa.gz,SRR24458089,spades_v3.15.5,metabat2_v2.6,default,CheckM2_v1.0.1,61.0,0.21,32.07,sediment metagenome,No,marine,cable_bacteria,marine_sediment,No,d__Bacteria;p__Proteobacteria;s_unclassified_Proteobacteria +sample,fasta,accession,fastq_1,fastq_2,assembly_software,binning_software,binning_parameters,stats_generation_software,completeness,contamination,genome_coverage,metagenome,co-assembly,broad_environment,local_environment,environmental_medium,RNA_presence,NCBI_lineage +lachnospira_eligens,data/bin_lachnospira_eligens.fa.gz,SRR24458089,,,spades_v3.15.5,metabat2_v2.6,default,CheckM2_v1.0.1,61.0,0.21,32.07,sediment metagenome,No,marine,cable_bacteria,marine_sediment,No,d__Bacteria;p__Proteobacteria;s__unclassified_Proteobacteria ``` +> [!IMPORTANT] +> **Samplesheet column requirements**: All columns shown in the example above must be present in your samplesheet, even if some values are empty. Columns must be in exactly the same order as shown. + ### `metagenomic_assemblies` mode (`ASSEMBLYSUBMIT`) The input must follow `assets/schema_input_assembly.json`. @@ -91,7 +100,7 @@ The input must follow `assets/schema_input_assembly.json`. 
Required columns: - `sample` -- `fasta` (must end with `.fa.gz` or `.fasta.gz`) +- `fasta` (must end with `.fa.gz`, `.fasta.gz`, or `.fna.gz`) - `run_accession` - `assembler` - `assembler_version` @@ -111,6 +120,9 @@ assembly_1,data/contigs_1.fasta.gz,data/reads_1.fastq.gz,data/reads_2.fastq.gz,, assembly_2,data/contigs_2.fasta.gz,,,42.7,ERR011323,MEGAHIT,1.2.9 ``` +> [!IMPORTANT] +> **Samplesheet column requirements**: All columns shown in the example above must be present in your samplesheet, even if some values are empty. Columns must be in exactly the same order as shown. + ## Usage > [!NOTE] @@ -122,6 +134,10 @@ All data submitted through this pipeline must be associated with an ENA study (p See the [usage documentation](docs/usage.md#submission-study) for more details. +### Database setup (`CheckM2` and `CAT_pack`) + +The `mags`/`bins` workflow requires databases for completeness/contamination estimation and taxonomy assignment. See [Usage documentation](usage.md) for details. + ### Required parameters: | Parameter | Description | @@ -137,7 +153,7 @@ See the [usage documentation](docs/usage.md#submission-study) for more details. | Parameter | Description | | ------------------- | ---------------------------------------------------------------------------------------- | | `--upload_tpa` | Flag to control the type of assembly study (third party assembly or not). Default: false | -| `--test_upload` | Upload to TEST ENA server instead of LIVE. Default: false | +| `--test_upload` | Upload to TEST ENA server instead of LIVE. Default: true | | `--webincli_submit` | If set to false, submissions will be validated, but not submitted. 
Default: true | General command template: @@ -202,8 +218,8 @@ For more details and further functionality, please refer to the [usage documenta Key output locations in `--outdir`: -- `upload/manifests/`: generated manifest files for submission -- `upload/webin_cli/`: ENA Webin CLI reports +- `mags/` or `bins/`: genome metadata, manifests, and per-sample submission support files +- `metagenomic_assemblies/`: assembly metadata CSVs and per-sample coverage files - `multiqc/`: MultiQC summary report - `pipeline_info/`: execution reports, trace, DAG, and software versions diff --git a/assets/samplesheet_genomes.csv b/assets/samplesheet_genomes.csv index 44a8138..4db4f00 100644 --- a/assets/samplesheet_genomes.csv +++ b/assets/samplesheet_genomes.csv @@ -1,3 +1,3 @@ -sample,fasta,accession,fastq_1,fastq_2,assembly_software,binning_software,binning_parameters,stats_generation_software,completeness,contamination,genome_coverage,metagenome,co-assembly,broad_environment,local_environment,environmental_medium,rRNA_presence,NCBI_lineage -lachnospira_eligens,https://github.com/nf-core/test-datasets/raw/seqsubmit/test_data/bins/bin_lachnospira_eligens.fa.gz,SRR24458089,spades_v3.15.5,mags_v1,default,CheckM2_v1.0.1,61.0,0.21,32.07,sediment metagenome,False,marine,cable bacteria,marine sediment,False,d__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__Desulfobacterales;f__Desulfobulbaceae;g__Candidatus Electrothrix;s__ -lachnospiraceae,https://github.com/nf-core/test-datasets/raw/seqsubmit/test_data/bins/bin_lachnospiraceae.fa.gz,SRR24458087,spades_v3.15.5,mags_v1,default,CheckM2_v1.0.1,92.81,1.09,66.04,sediment metagenome,False,marine,cable bacteria,marine sediment,False,d__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__Desulfobacterales;f__Desulfobulbaceae;g__Candidatus Electrothrix;s__Candidatus Electrothrix marina 
+sample,fasta,accession,fastq_1,fastq_2,assembly_software,binning_software,binning_parameters,stats_generation_software,completeness,contamination,genome_coverage,metagenome,co-assembly,broad_environment,local_environment,environmental_medium,RNA_presence,NCBI_lineage +lachnospira_eligens,https://github.com/nf-core/test-datasets/raw/seqsubmit/test_data/bins/bin_lachnospira_eligens.fa.gz,SRR24458089,,,spades_v3.15.5,mags_v1,default,CheckM2_v1.0.1,61.0,0.21,32.07,sediment metagenome,No,marine,cable bacteria,marine sediment,No,d__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__Desulfobacterales;f__Desulfobulbaceae;g__Candidatus Electrothrix;s__ +lachnospiraceae,https://github.com/nf-core/test-datasets/raw/seqsubmit/test_data/bins/bin_lachnospiraceae.fa.gz,SRR24458087,,,spades_v3.15.5,mags_v1,default,CheckM2_v1.0.1,92.81,1.09,66.04,sediment metagenome,No,marine,cable bacteria,marine sediment,No,d__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__Desulfobacterales;f__Desulfobulbaceae;g__Candidatus Electrothrix;s__Candidatus Electrothrix marina diff --git a/assets/schema_input_assembly.json b/assets/schema_input_assembly.json index 3b55e28..2b5fcea 100644 --- a/assets/schema_input_assembly.json +++ b/assets/schema_input_assembly.json @@ -17,8 +17,8 @@ "type": "string", "format": "file-path", "exists": true, - "pattern": "^([\\S\\s]*\\/)?[^\\s\\/]+\\.f(ast)?a\\.gz$", - "errorMessage": "FASTA file must be provided and have extension '.fa', '.fasta', '.fas', '.fna' (optionally gzipped)", + "pattern": "^([\\S\\s]*\\/)?[^\\s\\/]+\\.(fa|fasta|fna)\\.gz$", + "errorMessage": "FASTA file must be provided and have extension '.fa.gz', '.fasta.gz', '.fna.gz'", "description": "Metagenomic assembly FASTA file" }, "fastq_1": { diff --git a/assets/schema_input_genome.json b/assets/schema_input_genome.json index dfd01c3..f851f85 100644 --- a/assets/schema_input_genome.json +++ b/assets/schema_input_genome.json @@ -17,8 +17,8 @@ "type": "string", "format": "file-path", "exists": 
true, - "pattern": "^([\\S\\s]*\\/)?[^\\s\\/]+\\.f(ast)?a\\.gz$", - "errorMessage": "FASTA file for sequences 1 must be provided, cannot contain spaces and must have extension '.fa.gz' or '.fasta.gz'", + "pattern": "^([\\S\\s]*\\/)?[^\\s\\/]+\\.(fa|fasta|fna)\\.gz$", + "errorMessage": "FASTA file for sequences 1 must be provided, cannot contain spaces and must have extension '.fa.gz', '.fasta.gz', or '.fna.gz'", "description": "MAG/bin sequence file" }, "accession": { @@ -117,6 +117,7 @@ "required": [ "sample", "fasta", + "accession", "assembly_software", "co-assembly", "binning_software", diff --git a/conf/base.config b/conf/base.config index 42eb0c6..6a56c96 100644 --- a/conf/base.config +++ b/conf/base.config @@ -10,7 +10,6 @@ process { - // TODO nf-core: Check the defaults for all processes cpus = { 1 * task.attempt } memory = { 6.GB * task.attempt } time = { 4.h * task.attempt } @@ -24,8 +23,6 @@ process { // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. // If possible, it would be nice to keep the same label naming convention when // adding in your local modules too. - // TODO nf-core: Customise requirements for specific processes. - // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors withLabel:process_single { cpus = { 1 } memory = { 6.GB * task.attempt } diff --git a/conf/modules.config b/conf/modules.config index f828bef..7f87441 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -12,40 +12,138 @@ process { + // Default publishDir for all processes, can be overridden by individual process definitions below publishDir = [ path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] + // + // VALIDATION OF INPUT FASTA FILES + // + + withName: 'FASTAVALIDATOR' { + publishDir = [ + enabled: false + ] + } + + // + // TAXOMIC CLASSIFICATION SUBWORKFLOW + // + withName: 'CATPACK_ADDNAMES_BINS' { ext.args = '--only_official' publishDir = [ - path: { "${params.outdir}/${params.mode}/taxonomy" }, + path: { "${params.outdir}/${params.mode}/${meta.id}/taxonomy" }, mode: params.publish_dir_mode, pattern: "*.txt", + ] + } + + withName: 'CATPACK_BINS' { + publishDir = [ + path: { "${params.outdir}/${params.mode}/${meta.id}/taxonomy" }, + mode: params.publish_dir_mode, + pattern: "*.bin2classification.txt" + ] + } + + withName: 'CATPACK_PREPARE' { + publishDir = [ + [ + path: { "${params.outdir}/databases/cat_pack/" }, + mode: params.publish_dir_mode, + pattern: '*/db/', + saveAs: { filename -> "db" } + ], + [ + path: { "${params.outdir}/databases/cat_pack/" }, + mode: params.publish_dir_mode, + pattern: '*/tax/', + saveAs: { filename -> "tax" } + ] + ] + } + + withName: 'RENAME_FASTA_FOR_CATPACK|CAT_DB_UNTAR' { + publishDir = [ + enabled: false + ] + } + + // + // RNA DETECTION SUBWORKFLOW + // + + withName: 'BARRNAP' { + publishDir = [ + path: { "${params.outdir}/${params.mode}/${meta.id}/rna/barrnap" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'TRNASCANSE' { + publishDir = [ + path: { "${params.outdir}/${params.mode}/${meta.id}/rna/trnascanse" }, + mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] } + withName: 'COUNT_RNA' { + publishDir = [ + enabled: false + ] + } + + // + // GENOME QUALITY ASSESSMENT SUBWORKFLOW + // + withName: 'CHECKM2_PREDICT' { publishDir = [ - path: { "${params.outdir}/${params.mode}/checkm2" }, + path: { "${params.outdir}/${params.mode}/${meta.id}/checkm2" }, mode: params.publish_dir_mode, pattern: "*_checkm2_report.tsv", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + withName: 'CHECKM2_DATABASEDOWNLOAD' { + publishDir = [ + path: { "${params.outdir}/databases/checkm2/" }, + mode: params.publish_dir_mode, + pattern: '*.dmnd' + ] + } + + // + // COVERAGE CALCULATION + // + withName: 'COVERM_GENOME' { ext.args = '--min-covered-fraction 0 --methods mean' publishDir = [ - path: { "${params.outdir}/${params.mode}/coverage" }, + path: { "${params.outdir}/${params.mode}/${meta.id}/coverage" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'COVERM_CONTIG' { + publishDir = [ + path: { "${params.outdir}/${params.mode}/${meta.id}/coverage" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + // + // SUBMISSION AND MANIFEST GENERATION + // + withName: 'GENOME_UPLOAD' { publishDir = [ path: { "${params.outdir}/${params.mode}/upload/manifests" }, @@ -54,24 +152,36 @@ process { ] } - withName: 'ENA_WEBIN_CLI' { + withName: 'ENA_WEBIN_CLI_WRAPPER' { + ext.args = { params.mode == 'mags' || params.mode == 'bins' || params.mode == 'metagenomic_assemblies' ? "--context genome": "--context reads"} publishDir = [ - path: { "${params.outdir}/${params.mode}/upload/webin_cli" }, + path: { "${params.outdir}/${params.mode}/${meta.id}/upload/webin_cli" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + withName: 'GENERATE_ASSEMBLY_MANIFEST|REGISTERSTUDY' { + ext.args = { params.test_upload ? 
"--test" : "" } + } + + withName: 'ENA_WEBIN_CLI_DOWNLOAD|REGISTERSTUDY|GENERATE_ASSEMBLY_MANIFEST' { + publishDir = [ + enabled: false + ] + } + + // + // MULTIQC REPORT + // + withName: 'MULTIQC' { ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } publishDir = [ - path: { "${params.outdir}/${params.mode}/multiqc" }, + path: { "${params.outdir}/multiqc" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: 'GENERATE_ASSEMBLY_MANIFEST|ENA_WEBIN_CLI|REGISTERSTUDY' { - ext.args = { params.test_upload ? "--test" : "" } - } } diff --git a/conf/test.config b/conf/test.config index bb8ae69..8f56178 100644 --- a/conf/test.config +++ b/conf/test.config @@ -1,7 +1,37 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/seqsubmit -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + process { resourceLimits = [ cpus: 2, - memory: '15.GB', + memory: '12.GB', time: '1.h' ] } + +params { + config_profile_name = 'Test profile for minimal pipeline test' + config_profile_description = 'Runs --mode mags with multiple mags and missing metadata' + + // Input data + input = params.pipelines_testdata_base_path + 'seqsubmit/samplesheets/mag_multiple_bins_missing_metadata.csv' + + mode = "mags" + submission_study = "PRJEB98843" + centre_name = "TEST_CENTER" + + test_upload = true + + cat_db = params.pipelines_testdata_base_path + 'seqsubmit/test_data/small_cat_db/tax-db.tar.gz' + checkm2_db = null + +} diff --git a/conf/test_genome.config b/conf/test_assembly_complete_metadata.config similarity index 58% rename from conf/test_genome.config rename to conf/test_assembly_complete_metadata.config index bab14f9..16e4ccf 100644 --- a/conf/test_genome.config +++ b/conf/test_assembly_complete_metadata.config @@ -13,23 +13,21 @@ process { resourceLimits = [ cpus: 2, - memory: '20.GB', + memory: '8.GB', time: '1.h' ] } params { - config_profile_name = 'Test profile' - config_profile_description = 'Minimal test dataset to check pipeline function' + config_profile_name = 'Test --mode metagenomic_assemblies complete_metadata profile' + config_profile_description = 'Single-case assembly test with complete metadata values provided' // Input data - // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = params.pipelines_testdata_base_path + 'seqsubmit/samplesheets/samplesheet_genomesubmit.csv' + input = params.pipelines_testdata_base_path + 'seqsubmit/samplesheets/assembly_complete_metadata.csv' outdir = 'test_output' - mode = "mags" - submission_study = "PRJEB98843" - 
centre_name = "TEST_CENTER" + mode = "metagenomic_assemblies" + submission_study = "PRJEB98843" + centre_name = "TEST_CENTER" } diff --git a/conf/test_assembly.config b/conf/test_assembly_no_coverage_paired_reads.config similarity index 56% rename from conf/test_assembly.config rename to conf/test_assembly_no_coverage_paired_reads.config index 389e102..65c73e4 100644 --- a/conf/test_assembly.config +++ b/conf/test_assembly_no_coverage_paired_reads.config @@ -19,17 +19,15 @@ process { } params { - config_profile_name = 'Test profile' - config_profile_description = 'Minimal test dataset to check pipeline function' + config_profile_name = 'Test --mode metagenomic_assemblies no_coverage_paired_reads profile' + config_profile_description = 'Single-case assembly test with missing coverage and paired-end reads' // Input data - // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'assets/samplesheet_assembly.csv' + input = params.pipelines_testdata_base_path + 'seqsubmit/samplesheets/assembly_no_coverage_paired_reads.csv' outdir = 'test_output' - mode = "metagenomic_assemblies" - submission_study = "PRJEB98843" - centre_name = "TEST_CENTER" + mode = "metagenomic_assemblies" + submission_study = "PRJEB98843" + centre_name = "TEST_CENTER" } diff --git a/conf/test_assembly_no_coverage_single_reads.config b/conf/test_assembly_no_coverage_single_reads.config new file mode 100644 index 0000000..814de31 --- /dev/null +++ b/conf/test_assembly_no_coverage_single_reads.config @@ -0,0 +1,33 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/seqsubmit -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 2, + memory: '8.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test --mode metagenomic_assemblies no_coverage_single_reads profile' + config_profile_description = 'Single-case assembly test with missing coverage and single-end reads' + + // Input data + input = params.pipelines_testdata_base_path + 'seqsubmit/samplesheets/assembly_no_coverage_single_reads.csv' + outdir = 'test_output' + + mode = "metagenomic_assemblies" + submission_study = "PRJEB98843" + centre_name = "TEST_CENTER" + +} diff --git a/conf/test_assembly_one_contig.config b/conf/test_assembly_one_contig.config new file mode 100644 index 0000000..b27784e --- /dev/null +++ b/conf/test_assembly_one_contig.config @@ -0,0 +1,33 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/seqsubmit -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 2, + memory: '8.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test --mode metagenomic_assemblies one_contig profile' + config_profile_description = 'Single-case assembly test with one contig assembly' + + // Input data + input = params.pipelines_testdata_base_path + 'seqsubmit/samplesheets/assembly_one_contig.csv' + outdir = 'test_output' + + mode = "metagenomic_assemblies" + submission_study = "PRJEB98843" + centre_name = "TEST_CENTER" + +} diff --git a/conf/test_mag_complete_metadata.conf b/conf/test_mag_complete_metadata.conf new file mode 100644 index 0000000..a25e31b --- /dev/null +++ b/conf/test_mag_complete_metadata.conf @@ -0,0 +1,37 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/seqsubmit -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 2, + memory: '12.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test --mode mags complete_metadata profile' + config_profile_description = 'Single-case MAG test with complete metadata values provided' + + // Input data + input = params.pipelines_testdata_base_path + 'seqsubmit/samplesheets/mag_complete_metadata.csv' + + mode = "mags" + submission_study = "PRJEB98843" + centre_name = "TEST_CENTER" + + test_upload = true + + cat_db = null + checkm2_db = null + +} diff --git a/conf/test_mag_no_coverage_paired_reads.conf b/conf/test_mag_no_coverage_paired_reads.conf new file mode 100644 index 0000000..954a9a2 --- /dev/null +++ b/conf/test_mag_no_coverage_paired_reads.conf @@ -0,0 +1,37 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/seqsubmit -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 2, + memory: '12.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test --mode mags no_coverage_paired_reads profile' + config_profile_description = 'Single-case MAG test with missing genome_coverage and paired-end reads' + + // Input data + input = params.pipelines_testdata_base_path + 'seqsubmit/samplesheets/mag_no_coverage_paired_reads.csv' + + mode = "mags" + submission_study = "PRJEB98843" + centre_name = "TEST_CENTER" + + test_upload = true + + cat_db = null + checkm2_db = null + +} diff --git a/conf/test_mag_no_coverage_single_reads.conf b/conf/test_mag_no_coverage_single_reads.conf new file mode 100644 index 0000000..4892424 --- /dev/null +++ b/conf/test_mag_no_coverage_single_reads.conf @@ -0,0 +1,37 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/seqsubmit -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 2, + memory: '12.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test --mode mags no_coverage_single_reads profile' + config_profile_description = 'Single-case MAG test with missing genome_coverage and single-end reads' + + // Input data + input = params.pipelines_testdata_base_path + 'seqsubmit/samplesheets/mag_no_coverage_single_reads.csv' + + mode = "mags" + submission_study = "PRJEB98843" + centre_name = "TEST_CENTER" + + test_upload = true + + cat_db = null + checkm2_db = null + +} diff --git a/conf/test_mag_no_quality.conf b/conf/test_mag_no_quality.conf new file mode 100644 index 0000000..9d3724a --- /dev/null +++ b/conf/test_mag_no_quality.conf @@ -0,0 +1,39 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/seqsubmit -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 2, + memory: '12.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test --mode mags no_quality profile' + config_profile_description = 'Single-case MAG test with missing completeness and contamination values' + + // Input data + input = params.pipelines_testdata_base_path + 'seqsubmit/samplesheets/mag_no_quality.csv' + + mode = "mags" + submission_study = "PRJEB98843" + centre_name = "TEST_CENTER" + + test_upload = true + + cat_db = null + // CheckM2 doesn't allow usage of small test database, so real database will be downloaded during the test run, + // which is not ideal but necessary to test the pipeline's behaviour with missing quality values + checkm2_db = null + +} diff --git a/conf/test_mag_no_rna_presence.conf b/conf/test_mag_no_rna_presence.conf new file mode 100644 index 0000000..1577455 --- /dev/null +++ b/conf/test_mag_no_rna_presence.conf @@ -0,0 +1,37 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/seqsubmit -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 2, + memory: '12.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test --mode mags no_rna_presence profile' + config_profile_description = 'Single-case MAG test with missing RNA_presence value' + + // Input data + input = params.pipelines_testdata_base_path + 'seqsubmit/samplesheets/mag_no_rna_presence.csv' + + mode = "mags" + submission_study = "PRJEB98843" + centre_name = "TEST_CENTER" + + test_upload = true + + cat_db = null + checkm2_db = null + +} diff --git a/conf/test_mag_no_study_complete_metadata.config b/conf/test_mag_no_study_complete_metadata.config index aea18b1..18ccfad 100644 --- a/conf/test_mag_no_study_complete_metadata.config +++ b/conf/test_mag_no_study_complete_metadata.config @@ -13,7 +13,7 @@ process { resourceLimits = [ cpus: 2, - memory: '16.GB', + memory: '12.GB', time: '1.h' ] } diff --git a/conf/test_mag_no_taxonomy.conf b/conf/test_mag_no_taxonomy.conf new file mode 100644 index 0000000..bb9f687 --- /dev/null +++ b/conf/test_mag_no_taxonomy.conf @@ -0,0 +1,37 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/seqsubmit -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 2, + memory: '12.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test --mode mags no_taxonomy profile' + config_profile_description = 'Single-case MAG test with missing NCBI_lineage value' + + // Input data + input = params.pipelines_testdata_base_path + 'seqsubmit/samplesheets/mag_no_taxonomy.csv' + + mode = "mags" + submission_study = "PRJEB98843" + centre_name = "TEST_CENTER" + + test_upload = true + + cat_db = params.pipelines_testdata_base_path + 'seqsubmit/test_data/small_cat_db/tax-db.tar.gz' + checkm2_db = null + +} diff --git a/conf/test_mag_one_contig.conf b/conf/test_mag_one_contig.conf new file mode 100644 index 0000000..d9f5702 --- /dev/null +++ b/conf/test_mag_one_contig.conf @@ -0,0 +1,37 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/seqsubmit -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 2, + memory: '12.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test --mode mags one_contig profile' + config_profile_description = 'Single-case MAG test with one contig assembly' + + // Input data + input = params.pipelines_testdata_base_path + 'seqsubmit/samplesheets/mag_one_contig.csv' + + mode = "mags" + submission_study = "PRJEB98843" + centre_name = "TEST_CENTER" + + test_upload = true + + cat_db = null + checkm2_db = null + +} diff --git a/docs/methods.md b/docs/methods.md index 522ad49..0f15a33 100644 --- a/docs/methods.md +++ b/docs/methods.md @@ -7,7 +7,7 @@ - `GENOMESUBMIT` for `--mode mags` and `--mode bins` - `ASSEMBLYSUBMIT` for `--mode metagenomic_assemblies` -This page documents the methods that are currently implemented in the pipeline and includes placeholders for methods that will be documented once they are implemented. +This page documents the methods that are currently implemented in the pipeline. ## `GENOMESUBMIT` methods @@ -16,19 +16,21 @@ This page documents the methods that are currently implemented in the pipeline a The `GENOMESUBMIT` workflow: 1. Reads the samplesheet and associated genome FASTA files. -2. Reuses `RNA_presence` values supplied in the samplesheet when they are already present. -3. Calculates `RNA_presence` internally for entries where this field is missing. -4. Collects genome metadata into the tabular format required by `genome_upload`. +2. Validates genome FASTA files. +3. Reuses provided or calculates missing values for RNA genes presence, coverage, taxonomy, and genome quality metrics. +4. Collects genome metadata into the tabular format required by `genome_uploader`. 5. Generates submission manifests for ENA. 6. Performs submission to ENA. 
-### RNA presence detection +### Genome FASTA validation -#### When RNA detection runs +Genome FASTA files are validated with the `FASTAVALIDATOR` module before downstream processing. Each file is checked for FASTA format validity and contig count. A genome must contain at least two contigs to pass validation, which is an ENA requirement for contig-level submissions. -The workflow only runs internal RNA detection for entries where the `RNA_presence` column is empty. If a value is already supplied in the samplesheet, that value is passed through unchanged. +Only FASTA files that pass validation are retained for downstream processing and submission. -#### Tools used +### RNA presence detection + +The workflow only runs internal RNA detection for entries where the `RNA_presence` column is empty. If a value is already supplied in the samplesheet, that value is passed through unchanged. RNA detection is implemented through the `RNA_DETECTION` subworkflow and combines: @@ -64,7 +66,7 @@ $$ If multiple hits are found for the same subunit, the workflow keeps the best recovered percentage for that subunit. -A subunit is considered present when its best recovered percentage is greater than or equal to `params.rrna_limit`. The current default is `80`. +A subunit is considered present when its best recovered percentage is greater than or equal to `--rrna_limit`. The current default is `80`. #### tRNA detection @@ -90,39 +92,25 @@ The result is written as a two-column TSV file containing the genome identifier ### Genome coverage calculation -> [!NOTE] -> Placeholder section. -> -> This section will describe the internal genome coverage calculation once it is implemented in the workflow. -> -> For now, `genome_coverage` is treated as submission metadata. +Entries that already contain `genome_coverage` are passed through unchanged. 
For entries where coverage is missing, the workflow joins validated FASTA files with their associated read files and runs `coverm genome` through the `COVERM_GENOME` module (single-end or paired-end mode is selected from sample metadata). + +Genome coverage values from `coverm genome` output TSV are parsed and merged into submission metadata. ### Taxonomy assignment -> [!NOTE] -> Placeholder section. -> -> This section will describe the taxonomy assignment method once it is implemented in the workflow. -> -> For now, taxonomy is expected to be provided by the user in the `NCBI_lineage` column. +If `NCBI_lineage` is already present in the input samplesheet, the value is retained. If it is missing, the workflow runs taxonomy classification using the `CAT_pack` tool. +
+Before classification, input FASTA files are normalized to a `.fasta` suffix by `RENAME_FASTA_FOR_CATPACK`. Classification is then run in bin mode (`CAT_pack bins`, followed by `CAT_pack add_names`). -### Completeness assessment +Database input is taken from `--cat_db` when provided; otherwise the workflow uses `--cat_db_download_id` to download and prepare a CATPACK database. The resulting classification table is parsed, and the lineage field is written to `NCBI_lineage`. -> [!NOTE] -> Placeholder section. -> -> This section will describe the completeness estimation method once it is implemented in the workflow. -> -> For now, completeness is expected to be provided by the user in the `completeness` column. +### Completeness and contamination assessment -### Contamination assessment +Completeness and contamination are evaluated together in a shared genome quality step. -> [!NOTE] -> Placeholder section. -> -> This section will describe the contamination estimation method once it is implemented in the workflow. -> -> For now, contamination is expected to be provided by the user in the `contamination` column. 
+The workflow checks three samplesheet fields: `completeness`, `contamination`, and `stats_generation_software`. If all three are already present, those values are reused. If any of them is missing, the genome is analysed with `CheckM2 predict`. If `--checkm2_db` is supplied and exists, that database is used directly. Otherwise, the workflow downloads a CheckM2 database from Zenodo (using the configured database ID) and then runs prediction. + +For records that run `CheckM2`, completeness and contamination are extracted from the generated quality report (`quality_report.tsv`) and the `CheckM2` version used is recorded as `stats_generation_software`. ## `ASSEMBLYSUBMIT` methods @@ -139,17 +127,19 @@ The `ASSEMBLYSUBMIT` workflow: ### Assembly FASTA validation -Assembly FASTA files are validated with `FASTAVALIDATOR` before downstream processing. Only assemblies that pass validation continue to the coverage and submission steps. +Assembly FASTA files are validated with `FASTAVALIDATOR` before downstream processing. Each file is checked for FASTA format validity and contig count. An assembly must contain at least two contigs to pass validation, which is an ENA requirement for contig-level submissions. + +Only assemblies with successful validation are forwarded to coverage estimation, metadata/manifest generation and submission. ### Coverage calculation If the `coverage` column is already populated in the samplesheet, that value is used directly. -If `coverage` is missing, the workflow joins each validated assembly with its associated read files and calculates coverage with `coverm contig`. +If `coverage` is missing, the workflow calculates coverage with `coverm contig`. `coverm contig` outputs per-contig depth. The workflow then reads this file and calculates the arithmetic mean across all contigs. 
-If the per-contig coverage values are $c_1, c_2, \ldots, c_n$, the workflow currently computes assembly coverage as nweighted mean across contigs: +If the per-contig coverage values are $c_1, c_2, \ldots, c_n$, the workflow currently computes assembly coverage as an unweighted mean across contigs: $$ \bar{c} = \frac{1}{n} \sum_{i=1}^{n} c_i diff --git a/docs/output.md b/docs/output.md index ae97140..ff2ac63 100644 --- a/docs/output.md +++ b/docs/output.md @@ -2,18 +2,53 @@ ## Introduction -This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. +This document describes the output produced by the `nf-core/seqsubmit` pipeline. -The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. - - +The directories listed below will be created in the results directory (set with `--outdir`) after the pipeline has finished. All paths are relative to the top-level results directory. ## Pipeline overview -The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: +The pipeline is built using [Nextflow](https://www.nextflow.io/) and performs automated submission of sequence data to ENA. Exact steps and generated outputs depend on the data type and `--mode` executed (`mags`, `bins` or `metagenomic_assemblies`). + +## `mags` and `bins` outputs + +When `--mode mags` or `--mode bins` is used, results are written under `mags/` or `bins/`. + +
+Output files + +- `/` + - `genomes_metadata.csv`: tabular metadata assembled for submission. + - `upload/manifests/`: manifests generated by `genome_uploader`. + - `databases/checkm2/`: downloaded CheckM2 database file (when downloaded during the run). + - `databases/cat_pack/`: prepared CAT_pack database directories (when generated during the run). + - `/coverage/`: `coverm genome` output for samples where `genome_coverage` had to be computed. + - `/rna/barrnap/`: `barrnap` output GFF for samples where `RNA_presence` had to be inferred. + - `/rna/trnascanse/`: `tRNAscan-SE` outputs for the same RNA inference step. + - `/taxonomy/`: CAT/BAT taxonomic classification for samples where `NCBI_lineage` was missing. + - `/checkm2/`: CheckM2 reports for samples where completeness/contamination metadata had to be computed. + - `/upload/webin_cli/`: Webin-CLI wrapper output TSV with accessions for submitted genomes. + +
+ +Many of these per-sample directories are conditional. For example, if `genome_coverage` is already provided in the samplesheet, the corresponding `coverage/` directory will not be created for that sample. + +## `metagenomic_assemblies` outputs + +When `--mode metagenomic_assemblies` is used, results are written under `metagenomic_assemblies/`. + +
+Output files + +- `metagenomic_assemblies/` + - `_assembly_metadata.csv`: per-assembly metadata CSV generated for manifest creation. + - `/coverage/`: `coverm contig` output for samples where `coverage` had to be calculated from reads. + +
+ +Assembly study registration, manifest generation, and Webin-CLI submission are executed by the workflow, but their intermediate outputs are not currently published into `--outdir` by the pipeline. -- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline -- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +## Common outputs ### MultiQC diff --git a/docs/usage.md b/docs/usage.md index ad32375..f6358c0 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -24,8 +24,8 @@ Before running the pipeline, make sure that: Set your Webin credentials as Nextflow secrets: ```bash -nextflow secrets set WEBIN_ACCOUNT "Webin-XXX" -nextflow secrets set WEBIN_PASSWORD "XXX" +nextflow secrets set ENA_WEBIN "Webin-XXX" +nextflow secrets set ENA_WEBIN_PASSWORD "XXX" ``` ## Samplesheet input @@ -47,10 +47,13 @@ sample,fasta,accession,fastq_1,fastq_2,assembly_software,binning_software,binnin mag_001,data/mag_001.fasta.gz,SRR24458089,,,SPAdes 3.15.5,MetaBAT2 2.15,default,CheckM2 1.0.1,92.81,1.09,66.04,sediment metagenome,No,marine,cable bacteria,marine sediment,No,d__Bacteria;p__Proteobacteria;s__ ``` +> [!IMPORTANT] +> **Samplesheet column requirements**: All columns shown in the example above must be present in your samplesheet, even if some values are empty. Columns must be in exactly the same order as shown. + | Column | Description | | --------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `sample` | Unique identifier of this particular data entry. It is used as the alias when submitting to ENA, so it must be unique within one Webin account. | -| `fasta` | Path to MAG/bin contigs in FASTA format compressed with `gzip`. 
| +| `fasta` | Path to MAG/bin contigs in FASTA format compressed with `gzip`. All names of the FASTA files must be unique to prevent pipeline errors. | | `accession` | ENA accession of the run or metagenomic assembly used to generate the MAG/bin. | | `fastq_1` | Path to the read file in FASTQ format used to generate the source metagenomic assembly. Required if `genome_coverage` is not provided. | | `fastq_2` | Path to the second read file in FASTQ format for paired-end data used to generate the source metagenomic assembly. Leave empty for single-end reads. | @@ -86,6 +89,9 @@ assembly_001,data/assembly_001.fasta.gz,data/assembly_001_R1.fastq.gz,data/assem assembly_002,data/assembly_002.fasta.gz,,,42.7,ERR011323,MEGAHIT,1.2.9 ``` +> [!IMPORTANT] +> **Samplesheet column requirements**: All columns shown in the example above must be present in your samplesheet, even if some values are empty. Columns must be in exactly the same order as shown. + | Column | Description | | ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- | | `sample` | Unique identifier of this particular data entry. It is used as the alias when submitting to ENA, so it must be unique within one Webin account. | @@ -163,6 +169,28 @@ study-soil-2026 Soil microbiome study Survey of soil microbiota An example metadata file is available at [assets/study_metadata.json](../assets/study_metadata.json). +## Database preparation (`mags` / `bins`) + +The `GENOMESUBMIT` workflow uses `CheckM2` and `CAT_pack` that require specialized databases for completeness/contamination assessment and taxonomy assignment. + +You can either provide pre-existing databases or let the pipeline prepare them during execution. + +- `CheckM2`: + - provide the path to local database with `--checkm2_db`, otherwise the pipeline downloads version specified with `--checkm2_db_zenodo_id` (by default `14897628`). 
+ +- `CAT_pack`: + - provide the path to local database (containing `tax/` and `db/` folders or `tar.gz` archive) with `--cat_db`, otherwise the pipeline constructs version specified with `--cat_db_download_id` (by default `nr`). + +See [CAT_pack documentation](https://github.com/MGXlab/CAT_pack) and [CheckM2 documentation](https://github.com/chklovski/CheckM2) for more details on usage and creation of databases. + +> [!IMPORTANT] +> `CAT_pack` database creation can take significant time. +> +> Reusing an existing database is strongly recommended for repeated runs. +> +> Databases created/downloaded by the pipeline are published under: +> `${params.outdir}/databases/` + ## Running the pipeline General command template: diff --git a/modules.json b/modules.json index 158c21f..3e38af7 100644 --- a/modules.json +++ b/modules.json @@ -69,7 +69,8 @@ "multiqc": { "branch": "master", "git_sha": "af27af1be706e6a2bb8fe454175b0cdf77f47b49", - "installed_by": ["modules"] + "installed_by": ["modules"], + "patch": "modules/nf-core/multiqc/multiqc.diff" }, "trnascanse": { "branch": "master", diff --git a/modules/local/count_rna/tests/data/ecoli.stats b/modules/local/count_rna/tests/data/ecoli.stats new file mode 100644 index 0000000..4f7cfe3 --- /dev/null +++ b/modules/local/count_rna/tests/data/ecoli.stats @@ -0,0 +1,84 @@ + +tRNAscan-SE v.2.0.12 (Nov 2022) scan results (on host 311e9175be92) +Started: Wed Mar 25 17:02:42 UTC 2026 + +------------------------------------------------------------ +Sequence file(s) to search: ecoli.fna +Search Mode: Eukaryotic +Results written to: lachnospiraceae.tsv +Output format: Tabular +Searching with: Infernal First Pass->Infernal +Isotype-specific model scan: Yes +Covariance model: /usr/local/lib/tRNAscan-SE/models/TRNAinf-euk.cm + /usr/local/lib/tRNAscan-SE/models/TRNAinf-euk-SeC.cm +Infernal first pass cutoff score: 10 + +Temporary directory: . 
+Search log saved in: lachnospiraceae.log +Search statistics saved in: lachnospiraceae.stats +------------------------------------------------------------ + +First-pass Stats: +--------------- +Sequences read: 2 +Seqs w/at least 1 hit: 1 +Bases read: 4642052 (x2 for both strands) +Bases in tRNAs: 6562 +tRNAs predicted: 87 +Av. tRNA length: 75 +Script CPU time: 0.81 s +Scan CPU time: 237.75 s +Scan speed: 39.0 Kbp/sec + +First pass search(es) ended: Wed Mar 25 17:04:43 UTC 2026 + +Infernal Stats: +----------- +Candidate tRNAs read: 87 +Infernal-confirmed tRNAs: 87 +Bases scanned by Infernal: 8302 +% seq scanned by Infernal: 0.1 % +Script CPU time: 0.68 s +Infernal CPU time: 90.17 s +Scan speed: 92.1 bp/sec + +Infernal analysis of tRNAs ended: Wed Mar 25 17:05:53 UTC 2026 + +Overall scan speed: 28184.0 bp/sec + +tRNAs decoding Standard 20 AA: 84 +Selenocysteine tRNAs (TCA): 1 +Possible suppressor tRNAs (CTA,TTA,TCA): 0 +tRNAs with undetermined/unknown isotypes: 0 +Predicted pseudogenes: 2 + ------- +Total tRNAs: 87 + +tRNAs with introns: 0 + +| + +Isotype / Anticodon Counts: + +Ala : 5 AGC: GGC: 2 CGC: TGC: 3 +Gly : 6 ACC: GCC: 4 CCC: 1 TCC: 1 +Pro : 3 AGG: GGG: 1 CGG: 1 TGG: 1 +Thr : 5 AGT: GGT: 2 CGT: 2 TGT: 1 +Val : 7 AAC: GAC: 2 CAC: TAC: 5 +Ser : 5 AGA: GGA: 2 CGA: 1 TGA: 1 ACT: GCT: 1 +Arg : 7 ACG: 4 GCG: CCG: 1 TCG: CCT: 1 TCT: 1 +Leu : 8 AAG: GAG: 1 CAG: 4 TAG: 1 CAA: 1 TAA: 1 +Phe : 2 AAA: GAA: 2 +Asn : 4 ATT: GTT: 4 +Lys : 6 CTT: TTT: 6 +Asp : 3 ATC: GTC: 3 +Glu : 4 CTC: TTC: 4 +His : 1 ATG: GTG: 1 +Gln : 4 CTG: 2 TTG: 2 +Ile : 3 AAT: GAT: 3 CAT: TAT: +Met : 8 CAT: 8 +Tyr : 3 ATA: GTA: 3 +Supres : 0 CTA: TTA: TCA: +Cys : 1 ACA: GCA: 1 +Trp : 1 CCA: 1 +SelCys : 1 TCA: 1 diff --git a/modules/local/count_rna/tests/data/ecoli_bac.gff b/modules/local/count_rna/tests/data/ecoli_bac.gff new file mode 100644 index 0000000..aab13bb --- /dev/null +++ b/modules/local/count_rna/tests/data/ecoli_bac.gff @@ -0,0 +1,23 @@ +##gff-version 3 +NC_000913.3 barrnap:0.9 rRNA 
223774 225311 0 + . Name=16S_rRNA;product=16S ribosomal RNA +NC_000913.3 barrnap:0.9 rRNA 225761 228661 0 + . Name=23S_rRNA;product=23S ribosomal RNA +NC_000913.3 barrnap:0.9 rRNA 228760 228870 1.9e-11 + . Name=5S_rRNA;product=5S ribosomal RNA +NC_000913.3 barrnap:0.9 rRNA 2726074 2726184 1.9e-11 - . Name=5S_rRNA;product=5S ribosomal RNA +NC_000913.3 barrnap:0.9 rRNA 2726282 2729182 0 - . Name=23S_rRNA;product=23S ribosomal RNA +NC_000913.3 barrnap:0.9 rRNA 2729617 2731154 0 - . Name=16S_rRNA;product=16S ribosomal RNA +NC_000913.3 barrnap:0.9 rRNA 3423428 3423538 4.4e-11 - . Name=5S_rRNA;product=5S ribosomal RNA +NC_000913.3 barrnap:0.9 rRNA 3423673 3423783 1.9e-11 - . Name=5S_rRNA;product=5S ribosomal RNA +NC_000913.3 barrnap:0.9 rRNA 3423881 3426781 0 - . Name=23S_rRNA;product=23S ribosomal RNA +NC_000913.3 barrnap:0.9 rRNA 3427222 3428759 0 - . Name=16S_rRNA;product=16S ribosomal RNA +NC_000913.3 barrnap:0.9 rRNA 3941811 3943348 0 + . Name=16S_rRNA;product=16S ribosomal RNA +NC_000913.3 barrnap:0.9 rRNA 3943706 3946606 0 + . Name=23S_rRNA;product=23S ribosomal RNA +NC_000913.3 barrnap:0.9 rRNA 3946704 3946814 1.9e-11 + . Name=5S_rRNA;product=5S ribosomal RNA +NC_000913.3 barrnap:0.9 rRNA 4035534 4037071 0 + . Name=16S_rRNA;product=16S ribosomal RNA +NC_000913.3 barrnap:0.9 rRNA 4037521 4040422 0 + . Name=23S_rRNA;product=23S ribosomal RNA +NC_000913.3 barrnap:0.9 rRNA 4040521 4040631 2.5e-11 + . Name=5S_rRNA;product=5S ribosomal RNA +NC_000913.3 barrnap:0.9 rRNA 4166662 4168199 0 + . Name=16S_rRNA;product=16S ribosomal RNA +NC_000913.3 barrnap:0.9 rRNA 4168643 4171543 0 + . Name=23S_rRNA;product=23S ribosomal RNA +NC_000913.3 barrnap:0.9 rRNA 4171641 4171751 6.5e-11 + . Name=5S_rRNA;product=5S ribosomal RNA +NC_000913.3 barrnap:0.9 rRNA 4208150 4209687 0 + . Name=16S_rRNA;product=16S ribosomal RNA +NC_000913.3 barrnap:0.9 rRNA 4210045 4212945 0 + . Name=23S_rRNA;product=23S ribosomal RNA +NC_000913.3 barrnap:0.9 rRNA 4213044 4213154 6.5e-11 + . 
Name=5S_rRNA;product=5S ribosomal RNA diff --git a/modules/local/ena_webin_cli/main.nf b/modules/local/ena_webin_cli/main.nf deleted file mode 100644 index 25b12f4..0000000 --- a/modules/local/ena_webin_cli/main.nf +++ /dev/null @@ -1,61 +0,0 @@ -process ENA_WEBIN_CLI { - tag "$meta.id" - label 'process_low' - - container "quay.io/biocontainers/ena-webin-cli:9.0.1--hdfd78af_1" - - stageInMode 'copy' - secret 'WEBIN_ACCOUNT' - secret 'WEBIN_PASSWORD' - - input: - tuple val(meta), path(submission_item), path(manifest) - - output: - tuple val(meta), path("*webin-cli.report"), emit: webin_report - tuple val(meta), env('STATUS') , emit: upload_status - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def prefix = task.ext.prefix ?: "${meta.id}" - def mode = params.test_upload ? "-test" : "" - def submit_or_validate = params.webincli_submit ? "-submit": "-validate" - - """ - # change FASTA path in manifest to current workdir - export ITEM_FULL_PATH=\$(readlink -f ${submission_item}) - sed 's|^FASTA\t.*|FASTA\t'"\${ITEM_FULL_PATH}"'|g' ${manifest} > ${prefix}_updated_manifest.manifest - - ena-webin-cli \\ - -context=genome \\ - -manifest=${prefix}_updated_manifest.manifest \\ - -userName="\${WEBIN_ACCOUNT}" \\ - -password="\${WEBIN_PASSWORD}" \\ - ${submit_or_validate} \\ - ${mode} - - mv webin-cli.report "${prefix}_webin-cli.report" - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - ena-webin-cli: \$(ena-webin-cli -version 2>&1 ) - END_VERSIONS - - # status check - if grep -q "submission has been completed successfully" "${prefix}_webin-cli.report"; then - # first time submission completed successfully - export STATUS="success" - true - elif grep -q "object being added already exists in the submission account with accession" "${prefix}_webin-cli.report"; then - # there was attempt to re-submit already submitted genome - export STATUS="success" - true - else - export STATUS="failed" - false - fi - """ -} diff 
--git a/modules/local/ena_webin_cli/meta.yml b/modules/local/ena_webin_cli/meta.yml deleted file mode 100644 index 53a914f..0000000 --- a/modules/local/ena_webin_cli/meta.yml +++ /dev/null @@ -1,48 +0,0 @@ -name: ena_webin_cli -description: ENA data submission tool using Webin account details -keywords: - - ena - - submission - - upload - - webin -tools: - - ena_webin_cli: - description: | - Data submissions to ENA can be made using the Webin command line submission interface (Webin-CLI). Webin submission account credentials are required to use the program. - documentation: https://github.com/enasequence/webin-cli - licence: ["Apache License"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information. - - submission_item: - type: file - description: | - Target FASTA file for submission (mag/bin/assembly) - - manifest: - type: file - description: | - Submission manifest - -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - webin_report: - type: file - description: Submission report - pattern: "*webin-cli.report" - - STATUS: - type: bool - description: Submission status success(true) or failed(false - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - -authors: - - "@KateSakharova" - - "@ochkalova" diff --git a/modules/local/ena_webin_cli_download/environment.yml b/modules/local/ena_webin_cli_download/environment.yml new file mode 100644 index 0000000..61dc7ba --- /dev/null +++ b/modules/local/ena_webin_cli_download/environment.yml @@ -0,0 +1,6 @@ +--- +channels: + - conda-forge + - bioconda +dependencies: + - "conda-forge::wget" diff --git a/modules/local/ena_webin_cli_download/main.nf b/modules/local/ena_webin_cli_download/main.nf new file mode 100644 index 0000000..b156b58 --- /dev/null +++ b/modules/local/ena_webin_cli_download/main.nf @@ -0,0 +1,23 @@ +process ENA_WEBIN_CLI_DOWNLOAD { + label 'process_single' + + input: + val(version) + + output: + path("webin-cli-*.jar"), emit: webin_cli_jar + + when: + task.ext.when == null || task.ext.when + + script: + + """ + wget https://github.com/enasequence/webin-cli/releases/download/${version}/webin-cli-${version}.jar + """ + + stub: + """ + touch webin-cli-stub.jar + """ +} diff --git a/modules/local/ena_webin_cli_download/meta.yml b/modules/local/ena_webin_cli_download/meta.yml new file mode 100644 index 0000000..99f949b --- /dev/null +++ b/modules/local/ena_webin_cli_download/meta.yml @@ -0,0 +1,35 @@ +name: "ena_webin_cli_download" +description: Downloads the ENA Webin-CLI JAR file from GitHub releases. +keywords: + - ena + - webin + - submission + - download +tools: + - "wget": + description: "A free utility for non-interactive download of files from the web." 
+ homepage: "https://www.gnu.org/software/wget/" + documentation: "https://www.gnu.org/software/wget/manual/" + licence: ["GPL-3.0-or-later"] + identifier: null + +input: + - - version: + type: value + description: | + Version of the Webin-CLI JAR to download. + Example: "7.4.1" + +output: + webin_cli_jar: + - - "webin-cli-*.jar": + type: file + description: The downloaded Webin-CLI JAR file. + pattern: "webin-cli-*.jar" + +authors: + - "@KateSakharova" + - "@ochkalova" +maintainers: + - "@KateSakharova" + - "@ochkalova" diff --git a/modules/local/ena_webin_cli_download/tests/main.nf.test b/modules/local/ena_webin_cli_download/tests/main.nf.test new file mode 100644 index 0000000..888d9c6 --- /dev/null +++ b/modules/local/ena_webin_cli_download/tests/main.nf.test @@ -0,0 +1,51 @@ +nextflow_process { + + name "Test Process ENA_WEBIN_CLI_DOWNLOAD" + script "../main.nf" + process "ENA_WEBIN_CLI_DOWNLOAD" + + tag "modules" + tag "ena_webin_cli_download" + + test("ENA_WEBIN_CLI_DOWNLOAD - downloads webin-cli jar") { + + when { + process { + """ + input[0] = "9.0.3" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.out.webin_cli_jar.size() == 1 }, + { assert process.out.webin_cli_jar[0].toString().endsWith(".jar") } + ) + } + + } + + test("ENA_WEBIN_CLI_DOWNLOAD - stub") { + + options "-stub" + + when { + process { + """ + input[0] = "9.0.3" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/local/ena_webin_cli_download/tests/main.nf.test.snap b/modules/local/ena_webin_cli_download/tests/main.nf.test.snap new file mode 100644 index 0000000..6f3bc74 --- /dev/null +++ b/modules/local/ena_webin_cli_download/tests/main.nf.test.snap @@ -0,0 +1,19 @@ +{ + "ENA_WEBIN_CLI_DOWNLOAD - stub": { + "content": [ + { + "0": [ + "webin-cli-stub.jar:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "webin_cli_jar": [ + 
"webin-cli-stub.jar:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "25.04.1" + }, + "timestamp": "2026-03-25T16:02:51.004952" + } +} \ No newline at end of file diff --git a/modules/local/ena_webin_cli_wrapper/environment.yml b/modules/local/ena_webin_cli_wrapper/environment.yml new file mode 100644 index 0000000..05f2127 --- /dev/null +++ b/modules/local/ena_webin_cli_wrapper/environment.yml @@ -0,0 +1,7 @@ +--- +channels: + - conda-forge + - bioconda +dependencies: + - "bioconda::mgnify-pipelines-toolkit" + - "conda-forge::openjdk" diff --git a/modules/local/ena_webin_cli_wrapper/main.nf b/modules/local/ena_webin_cli_wrapper/main.nf new file mode 100644 index 0000000..7918ce1 --- /dev/null +++ b/modules/local/ena_webin_cli_wrapper/main.nf @@ -0,0 +1,43 @@ +/* + * ena-webin-cli wrapper script that runs ena-webin-cli and handles errors +*/ +process ENA_WEBIN_CLI_WRAPPER { + + label 'process_low' + tag "${meta.id}" + container "quay.io/microbiome-informatics/java_mgnify-pipelines-toolkit:1.4.21" + stageInMode 'copy' + + input: + tuple val(meta), path(submission_item), path(manifest) + path(webin_cli_jar) + + output: + tuple val(meta), path("*_accessions.tsv"), emit: accessions + path "versions.yml", emit: versions + + script: + def args = task.ext.args ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" + def test_flag = params.test_upload ? "--test" : "" + def submit_or_validate = params.webincli_submit ? 
"--mode submit": "--mode validate" + + """ + # change FASTA path in manifest to current workdir + export ITEM_FULL_PATH=\$(readlink -f ${submission_item}) + sed 's|^FASTA\t.*|FASTA\t'"\${ITEM_FULL_PATH}"'|g' ${manifest} > ${prefix}_updated_manifest.manifest + + webin_cli_handler \\ + -m ${prefix}_updated_manifest.manifest \\ + -o ${prefix}_accessions.tsv \\ + --webin-cli-jar ${webin_cli_jar} \\ + ${submit_or_validate} \\ + ${test_flag} \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version 2>&1 | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/local/ena_webin_cli_wrapper/meta.yml b/modules/local/ena_webin_cli_wrapper/meta.yml new file mode 100644 index 0000000..a0abd93 --- /dev/null +++ b/modules/local/ena_webin_cli_wrapper/meta.yml @@ -0,0 +1,53 @@ +name: "ena_webin_cli_wrapper" +description: Runs ENA Webin-CLI to validate or submit assemblies, using the webin_cli_handler script from mgnify-pipelines-toolkit. +keywords: + - ena + - webin + - submission + - assembly +tools: + - "mgnify-pipelines-toolkit": + description: "A toolkit of utilities for MGnify pipelines, including webin_cli_handler for robust Webin-CLI submission." + homepage: "https://github.com/EBI-Metagenomics/mgnify-pipelines-toolkit" + documentation: "https://github.com/EBI-Metagenomics/mgnify-pipelines-toolkit" + licence: ["Apache-2.0"] + identifier: null + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information. + e.g. `[ id:'sample1' ]` + - submission_item: + type: file + description: | + FASTA file for submission (assembly, MAG, or bin). + - manifest: + type: file + description: | + Webin-CLI submission manifest file. + - - webin_cli_jar: + type: file + description: | + The Webin-CLI JAR file downloaded by ena_webin_cli_download. + pattern: "webin-cli-*.jar" + +output: + - - accessions: + type: file + description: | + TSV file containing the accession assigned by ENA for the submitted item. 
+ File has two columns: "alias" and "accession". + - - versions: + - "versions.yml": + type: file + description: File containing software versions. + pattern: "versions.yml" + +authors: + - "@KateSakharova" + - "@ochkalova" +maintainers: + - "@KateSakharova" + - "@ochkalova" diff --git a/modules/local/ena_webin_cli_wrapper/nextflow.config b/modules/local/ena_webin_cli_wrapper/nextflow.config new file mode 100644 index 0000000..b2fb248 --- /dev/null +++ b/modules/local/ena_webin_cli_wrapper/nextflow.config @@ -0,0 +1,16 @@ +params { + // Use ENA test server and validate only (no actual submission) + test_upload = true + webincli_submit = false +} + +process { + withName: ENA_WEBIN_CLI_WRAPPER { + ext.args = "" + } +} + +env { + ENA_WEBIN = secrets.ENA_WEBIN + ENA_WEBIN_PASSWORD = secrets.ENA_WEBIN_PASSWORD +} diff --git a/modules/local/generate_assembly_manifest/nextflow.config b/modules/local/generate_assembly_manifest/nextflow.config index 9a4037e..495e622 100644 --- a/modules/local/generate_assembly_manifest/nextflow.config +++ b/modules/local/generate_assembly_manifest/nextflow.config @@ -4,6 +4,6 @@ process { } } env { - ENA_WEBIN = secrets.WEBIN_ACCOUNT - ENA_WEBIN_PASSWORD = secrets.WEBIN_PASSWORD + ENA_WEBIN = secrets.ENA_WEBIN + ENA_WEBIN_PASSWORD = secrets.ENA_WEBIN_PASSWORD } diff --git a/modules/local/generate_assembly_manifest/tests/main.nf.test b/modules/local/generate_assembly_manifest/tests/main.nf.test index d04d7a2..2b96eae 100644 --- a/modules/local/generate_assembly_manifest/tests/main.nf.test +++ b/modules/local/generate_assembly_manifest/tests/main.nf.test @@ -24,22 +24,30 @@ nextflow_process { } then { - assert process.success - assertAll( - { assert snapshot( - sanitizeOutput(process.out), - path(process.out.versions[0]).yaml - ).match() }, - { assert process.out.manifest.size() == 1 }, - { assert process.out.manifest[0][1].toString().endsWith(".manifest") }, - { - def manifestContent = path(process.out.manifest[0][1]).text - assert 
manifestContent.contains("PRJ12345") : "PRJ12345 not found in manifest file" - } - ) - } + assert process.success + assertAll( + // Only snapshot versions, not the manifest because it contains a timestamp + { assert snapshot( + path(process.out.versions[0]).yaml + ).match() }, + { assert process.out.manifest.size() == 1 }, + { assert process.out.manifest[0][1].toString().endsWith(".manifest") }, + { + // Validate manifest structure + def manifestContent = path(process.out.manifest[0][1]).text + def lines = manifestContent.readLines() + + // Check length of manifest + assert lines.size() == 10 + // Check required fields are present + assert manifestContent.contains("SAMPLE SAMEA7687881") + assert manifestContent.contains("STUDY PRJ12345") + assert manifestContent.contains("RUN_REF ERR4918394") + } + ) } +} test("GENERATE_ASSEMBLY_MANIFEST completes with expected outputs - stub") { diff --git a/modules/local/generate_assembly_manifest/tests/main.nf.test.snap b/modules/local/generate_assembly_manifest/tests/main.nf.test.snap index cf8a9e1..054687f 100644 --- a/modules/local/generate_assembly_manifest/tests/main.nf.test.snap +++ b/modules/local/generate_assembly_manifest/tests/main.nf.test.snap @@ -11,7 +11,7 @@ ] ], "1": [ - "versions.yml:md5,4711ed8f2fd35e895aefafebd29f0333" + "versions.yml:md5,0664035de44b4d88c1a70a357c1a24f2" ], "manifest": [ [ @@ -22,46 +22,33 @@ ] ], "versions": [ - "versions.yml:md5,4711ed8f2fd35e895aefafebd29f0333" + "versions.yml:md5,0664035de44b4d88c1a70a357c1a24f2" ] }, { "GENERATE_ASSEMBLY_MANIFEST": { - "assembly_uploader": null + "assembly_uploader": "assembly_uploader 1.3.4" } } ], - "timestamp": "2026-03-13T14:02:14.937082", "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.4" - } + "nf-test": "0.9.0", + "nextflow": "25.04.1" + }, + "timestamp": "2026-03-25T16:53:50.281438" }, "GENERATE_ASSEMBLY_MANIFEST completes with expected outputs": { "content": [ - { - "manifest": [ - [ - { - "id": "test" - }, - 
"233126d4c4d0.manifest:md5,cacedcfcce220081e7aa2f98c2f4ffd6" - ] - ], - "versions": [ - "versions.yml:md5,0664035de44b4d88c1a70a357c1a24f2" - ] - }, { "GENERATE_ASSEMBLY_MANIFEST": { "assembly_uploader": "assembly_uploader 1.3.4" } } ], - "timestamp": "2026-03-13T12:32:23.722449", "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.4" - } + "nf-test": "0.9.0", + "nextflow": "25.04.1" + }, + "timestamp": "2026-03-25T17:34:01.598736" } } \ No newline at end of file diff --git a/modules/local/genome_upload/main.nf b/modules/local/genome_upload/main.nf index 3c5d348..221e8fe 100644 --- a/modules/local/genome_upload/main.nf +++ b/modules/local/genome_upload/main.nf @@ -4,9 +4,6 @@ process GENOME_UPLOAD { container "quay.io/biocontainers/genome-uploader:2.5.1--pyhdfd78af_1" - secret 'WEBIN_ACCOUNT' - secret 'WEBIN_PASSWORD' - input: path(mags) path(table_for_upload) @@ -19,7 +16,7 @@ process GENOME_UPLOAD { path "results/{MAG,bin}_upload/genome_samples.xml" , emit: upload_genome_samples path "results/{MAG,bin}_upload/registered_{MAGs,bins}*.tsv", emit: upload_registered_mags path "results/{MAG,bin}_upload/submission.xml" , emit: upload_submission_xml - path "versions.yml" , emit: versions + tuple val("${task.process}"), val('genome_uploader'), eval("genome_upload --version 2>&1 | sed 's/genome_uploader //g'"), topic: versions, emit: versions_genome_uploader when: task.ext.when == null || task.ext.when @@ -31,9 +28,6 @@ process GENOME_UPLOAD { def mode = (!params.test_upload) ? 
"--live" : "" """ - export ENA_WEBIN=\$WEBIN_ACCOUNT - export ENA_WEBIN_PASSWORD=\$WEBIN_PASSWORD - genome_upload \\ -u $submission_study \\ --genome_info ${table_for_upload} \\ @@ -44,10 +38,16 @@ process GENOME_UPLOAD { ${mode} \\ --out results \\ ${args} + """ - cat <<-END_VERSIONS > versions.yml - "${task.process}": - genome_uploader: \$(genome_upload --version 2>&1 | sed 's/genome_uploader //g') - END_VERSIONS + stub: + """ + mkdir results/MAG_upload + touch results/MAG_upload/ENA_backup.json + touch results/MAG_upload/genome_samples.xml + touch results/MAG_upload/submission.xml + touch results/MAG_upload/registered_MAGs_test.tsv + mkdir results/MAG_upload/manifests_test + touch results/MAG_upload/manifests_test/test_1.manifest """ } diff --git a/modules/local/genome_upload/nextflow.config b/modules/local/genome_upload/nextflow.config new file mode 100644 index 0000000..9110b4b --- /dev/null +++ b/modules/local/genome_upload/nextflow.config @@ -0,0 +1,4 @@ +env { + ENA_WEBIN = secrets.ENA_WEBIN + ENA_WEBIN_PASSWORD = secrets.ENA_WEBIN_PASSWORD +} diff --git a/modules/local/rename_fasta_for_catpack/tests/main.nf.test.snap b/modules/local/rename_fasta_for_catpack/tests/main.nf.test.snap new file mode 100644 index 0000000..112fec2 --- /dev/null +++ b/modules/local/rename_fasta_for_catpack/tests/main.nf.test.snap @@ -0,0 +1,56 @@ +{ + "RENAME_FASTA_FOR_CATPACK - uncompressed fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.fasta:md5,3dc842bfe6f1059ccb006fc454437ebf" + ] + ], + "renamed_fasta": [ + [ + { + "id": "test" + }, + "test.fasta:md5,3dc842bfe6f1059ccb006fc454437ebf" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "25.04.1" + }, + "timestamp": "2026-03-25T16:51:01.874131" + }, + "RENAME_FASTA_FOR_CATPACK - compressed fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test_compressed" + }, + "test_compressed.fasta:md5,b349eb81b2fc8db6a6cee870c8d4b054" + ] + ], + "renamed_fasta": [ + [ + { + "id": 
"test_compressed" + }, + "test_compressed.fasta:md5,b349eb81b2fc8db6a6cee870c8d4b054" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "25.04.1" + }, + "timestamp": "2026-03-25T16:51:04.916201" + } +} \ No newline at end of file diff --git a/modules/nf-core/fastavalidator/fastavalidator.diff b/modules/nf-core/fastavalidator/fastavalidator.diff index 4937db2..8575327 100644 --- a/modules/nf-core/fastavalidator/fastavalidator.diff +++ b/modules/nf-core/fastavalidator/fastavalidator.diff @@ -7,9 +7,9 @@ Changes in 'fastavalidator/meta.yml': description: Input fasta file pattern: "*.fasta" ontologies: [] -+ - - is_metagenome: ++ - - count_contigs: + type: boolean -+ description: True if the fasta file is from a metagenome. Enables number of contigs check. ++ description: Enables number of contigs check (ENA requires more than 1 contig for a contig-level assembly submission) output: success_log: - - meta: @@ -21,7 +21,7 @@ Changes in 'fastavalidator/main.nf': input: tuple val(meta), path(fasta) -+ val(is_metagenome) ++ val(count_contigs) output: - tuple val(meta), path('*.success.log') , emit: success_log , optional: true @@ -49,7 +49,7 @@ Changes in 'fastavalidator/main.nf': + # One more check: count contigs. More than 1 contig required. + echo "[INFO] Checking contig count..." + -+ if [ "${is_metagenome}" = true ]; then ++ if [ "${count_contigs}" = true ]; then + if [[ "${fasta}" == *.gz ]]; then + CONTIGS=\$(zcat "${fasta}" | grep -c '^>') + else diff --git a/modules/nf-core/fastavalidator/main.nf b/modules/nf-core/fastavalidator/main.nf index 87db6ca..c662fc9 100644 --- a/modules/nf-core/fastavalidator/main.nf +++ b/modules/nf-core/fastavalidator/main.nf @@ -9,7 +9,7 @@ process FASTAVALIDATOR { input: tuple val(meta), path(fasta) - val(is_metagenome) + val(count_contigs) output: tuple val(meta), path('*.success.log'), emit: success_log , optional: true @@ -33,7 +33,7 @@ process FASTAVALIDATOR { # One more check: count contigs. 
More than 1 contig required. echo "[INFO] Checking contig count..." - if [ "${is_metagenome}" = true ]; then + if [ "${count_contigs}" = true ]; then if [[ "${fasta}" == *.gz ]]; then CONTIGS=\$(zcat "${fasta}" | grep -c '^>') else diff --git a/modules/nf-core/fastavalidator/meta.yml b/modules/nf-core/fastavalidator/meta.yml index 35083d2..6d38dde 100644 --- a/modules/nf-core/fastavalidator/meta.yml +++ b/modules/nf-core/fastavalidator/meta.yml @@ -30,9 +30,9 @@ input: description: Input fasta file pattern: "*.fasta" ontologies: [] - - - is_metagenome: + - - count_contigs: type: boolean - description: True if the fasta file is from a metagenome. Enables number of contigs check. + description: Enables number of contigs check (ENA requires more than 1 contig for a contig-level assembly submission) output: success_log: - - meta: diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf index c1158fb..005e36a 100644 --- a/modules/nf-core/multiqc/main.nf +++ b/modules/nf-core/multiqc/main.nf @@ -2,9 +2,10 @@ process MULTIQC { label 'process_single' conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/8c/8c6c120d559d7ee04c7442b61ad7cf5a9e8970be5feefb37d68eeaa60c1034eb/data' : - 'community.wave.seqera.io/library/multiqc:1.32--d58f60e4deb769bf' }" + // TODO: version is temporarily set to 1.25.1 to avoid segfault happening in 1.32 + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 
'oras://community.wave.seqera.io/library/multiqc:1.25.1--6d0dfb7dbe16fbf9' + : 'community.wave.seqera.io/library/multiqc:1.25.1--214d24b7702218de'}" input: path multiqc_files, stageAs: "?/*" diff --git a/modules/nf-core/multiqc/multiqc.diff b/modules/nf-core/multiqc/multiqc.diff new file mode 100644 index 0000000..e0c8c7b --- /dev/null +++ b/modules/nf-core/multiqc/multiqc.diff @@ -0,0 +1,65 @@ +Changes in component 'nf-core/multiqc' +'modules/nf-core/multiqc/environment.yml' is unchanged +'modules/nf-core/multiqc/meta.yml' is unchanged +Changes in 'multiqc/main.nf': +--- modules/nf-core/multiqc/main.nf ++++ modules/nf-core/multiqc/main.nf +@@ -2,9 +2,10 @@ + label 'process_single' + + conda "${moduleDir}/environment.yml" +- container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? +- 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/8c/8c6c120d559d7ee04c7442b61ad7cf5a9e8970be5feefb37d68eeaa60c1034eb/data' : +- 'community.wave.seqera.io/library/multiqc:1.32--d58f60e4deb769bf' }" ++ // TODO: version is temporarily set to 1.25.1 to avoid segfault happening in 1.32 ++ container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ++ ? 
'oras://community.wave.seqera.io/library/multiqc:1.25.1--6d0dfb7dbe16fbf9' ++ : 'community.wave.seqera.io/library/multiqc:1.25.1--214d24b7702218de'}" + + input: + path multiqc_files, stageAs: "?/*" + +'modules/nf-core/multiqc/tests/main.nf.test.snap' is unchanged +Changes in 'multiqc/tests/nextflow.config': +--- modules/nf-core/multiqc/tests/nextflow.config ++++ modules/nf-core/multiqc/tests/nextflow.config +@@ -2,4 +2,4 @@ + withName: 'MULTIQC' { + ext.prefix = null + } +-} ++} +Changes in 'multiqc/tests/main.nf.test': +--- modules/nf-core/multiqc/tests/main.nf.test ++++ modules/nf-core/multiqc/tests/main.nf.test +@@ -15,7 +15,7 @@ + when { + process { + """ +- input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) ++ input[0] = channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) + input[1] = [] + input[2] = [] + input[3] = [] +@@ -41,8 +41,8 @@ + when { + process { + """ +- input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) +- input[1] = Channel.of(file("https://github.com/nf-core/tools/raw/dev/nf_core/pipeline-template/assets/multiqc_config.yml", checkIfExists: true)) ++ input[0] = channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) ++ input[1] = channel.of(file("https://github.com/nf-core/tools/raw/dev/nf_core/pipeline-template/assets/multiqc_config.yml", checkIfExists: true)) + input[2] = [] + input[3] = [] + input[4] = [] +@@ -68,7 +68,7 @@ + when { + process { + """ +- input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) ++ input[0] = channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) + 
input[1] = [] + input[2] = [] + input[3] = [] + +************************************************************ diff --git a/nextflow.config b/nextflow.config index 1cb8aff..21871a0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -5,7 +5,6 @@ Default config options for all compute environments ---------------------------------------------------------------------------------------- */ - // Global default params, used in configs params { @@ -21,6 +20,7 @@ params { upload_force = true test_upload = true webincli_submit = true + webin_cli_version = "9.0.3" // rna detection rrna_limit = 80 @@ -183,12 +183,21 @@ profiles { singularity.runOptions = '--nv' } // TODO: figure out how to better orginise tests for different workflow types (bins, mags, metagenomic_assemblies) - test { includeConfig 'conf/test.config' } - test_genome { includeConfig 'conf/test_genome.config' } - test_assembly { includeConfig 'conf/test_assembly.config' } - test_full { includeConfig 'conf/test_full.config' } - test_assembly_no_study_complete_metadata { includeConfig 'conf/test_assembly_no_study_complete_metadata.config' } + test { includeConfig 'conf/test.config' } + test_full { includeConfig 'conf/test_full.config' } + test_mag_complete_metadata { includeConfig 'conf/test_mag_complete_metadata.conf' } test_mag_no_study_complete_metadata { includeConfig 'conf/test_mag_no_study_complete_metadata.config' } + test_mag_no_coverage_single_reads { includeConfig 'conf/test_mag_no_coverage_single_reads.conf' } + test_mag_no_coverage_paired_reads { includeConfig 'conf/test_mag_no_coverage_paired_reads.conf' } + test_mag_no_quality { includeConfig 'conf/test_mag_no_quality.conf' } + test_mag_one_contig { includeConfig 'conf/test_mag_one_contig.conf' } + test_mag_no_rna_presence { includeConfig 'conf/test_mag_no_rna_presence.conf' } + test_mag_no_taxonomy { includeConfig 'conf/test_mag_no_taxonomy.conf' } + test_assembly_complete_metadata { includeConfig 'conf/test_assembly_complete_metadata.config' } + 
test_assembly_no_study_complete_metadata { includeConfig 'conf/test_assembly_no_study_complete_metadata.config' } + test_assembly_no_coverage_single_reads { includeConfig 'conf/test_assembly_no_coverage_single_reads.config' } + test_assembly_no_coverage_paired_reads { includeConfig 'conf/test_assembly_no_coverage_paired_reads.config' } + test_assembly_one_contig { includeConfig 'conf/test_assembly_one_contig.config' } } // Load nf-core custom profiles from different institutions @@ -222,8 +231,8 @@ env { R_PROFILE_USER = "/.Rprofile" R_ENVIRON_USER = "/.Renviron" JULIA_DEPOT_PATH = "/usr/local/share/julia" - ENA_WEBIN = secrets.WEBIN_ACCOUNT - ENA_WEBIN_PASSWORD = secrets.WEBIN_PASSWORD + ENA_WEBIN = secrets.ENA_WEBIN + ENA_WEBIN_PASSWORD = secrets.ENA_WEBIN_PASSWORD } // Set bash options diff --git a/nextflow_schema.json b/nextflow_schema.json index 83b1ed2..9f52527 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -51,7 +51,7 @@ "properties": { "rrna_limit": { "type": "number", - "description": "Minimum number of rRNA.", + "description": "Minimum percentage of 16S, 23S, and 5S rRNA gene length recovered to count the gene as present.", "default": 80, "fa_icon": "fas fa-users-cog" }, @@ -72,7 +72,7 @@ "properties": { "cat_db": { "type": "string", - "description": "Path to CAT_pack DB", + "description": "Path to local CAT_pack DB", "help": "Database should be pre-downloaded using CATpack commands. The folder with database should contain sub-folders 'db' with files .dmnd, .fastaid2LCAtaxid, .taxids_with_multiple_offspring, etc. And subfolder 'tax' that should contain taxonomy files like .dmp, .taxdump.tar.gz, .prot.accession2taxid.FULL.gz, etc", "fa_icon": "fas fa-users-cog" }, @@ -294,9 +294,9 @@ }, "upload_force": { "type": "boolean", - "description": "Flag to control upload retries", + "description": "Enables force mode for genome_uploader (used for MAGs/BINs submission)", "default": true, - "help": "Forces reset of sample xmls generation. 
This is useful if you changed something in your tsv table, or if ENA metadata haven't been downloaded correctly (you can check this in ENA_backup.json). Default: true" + "help": "Forces reset of bin/MAG sample xmls generation. This is useful if you changed something in your tsv table, or if ENA metadata haven't been downloaded correctly (you can check this in ENA_backup.json). Default: true" }, "submission_study": { "type": "string", @@ -316,6 +316,12 @@ "description": "Submit or validate", "default": true, "help": "Flag to run submission or validation. Submission (true) will run upload of data with ena-webin-cli. Validation (false) validates correctness of input files, it does not do submission. Default: true (submit)" + }, + "webin_cli_version": { + "type": "string", + "description": "Version of webin-cli.jar to use for submission", + "default": "9.0.3", + "help": "Check available versions at https://github.com/enasequence/webin-cli" } } } diff --git a/nf-test.config b/nf-test.config index 613fc05..65efb9f 100644 --- a/nf-test.config +++ b/nf-test.config @@ -15,7 +15,7 @@ config { profile "test" // list of filenames or patterns that should be trigger a full test run - triggers 'nextflow.config', 'nf-test.config', 'conf/test_genome.config', 'conf/test_assembly.config', 'tests/nextflow.config', 'tests/.nftignore' + triggers 'nextflow.config', 'nf-test.config', 'tests/nextflow.config', 'tests/.nftignore' // load the necessary plugins plugins { diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index 3ff34eb..7e0e70b 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -23,7 +23,7 @@ "@type": "Dataset", "creativeWorkStatus": "InProgress", "datePublished": "2025-11-20T09:32:34+00:00", - "description": "

\n \n \n \"nf-core/seqsubmit\"\n \n

\n\n[![Open in GitHub Codespaces](https://img.shields.io/badge/Open_In_GitHub_Codespaces-black?labelColor=grey&logo=github)](https://github.com/codespaces/new/nf-core/seqsubmit)\n[![GitHub Actions CI Status](https://github.com/nf-core/seqsubmit/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/seqsubmit/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/seqsubmit/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/seqsubmit/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/seqsubmit/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.04.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.5.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.5.1)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/seqsubmit)\n\n[![Get help on 
Slack](http://img.shields.io/badge/slack-nf--core%20%23seqsubmit-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/seqsubmit)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/seqsubmit** is a Nextflow pipeline for submitting sequence data to [ENA](https://www.ebi.ac.uk/ena/browser/home).\nCurrently, the pipeline supports three submission modes, each routed to a dedicated workflow and requiring its own input samplesheet structure:\n\n- `mags` for Metagenome Assembled Genomes (MAGs) submission with `GENOMESUBMIT` workflow\n- `bins` for bins submission with `GENOMESUBMIT` workflow\n- `metagenomic_assemblies` for assembly submission with `ASSEMBLYSUBMIT` workflow\n\n![seqsubmit workflow diagram](assets/seqsubmit_schema.png)\n\n## Requirements\n\n- [Nextflow](https://www.nextflow.io/) `>=25.04.0`\n- Webin account registered at https://www.ebi.ac.uk/ena/submit/webin/login\n- Raw reads used to assemble contigs submitted to [INSDC](https://www.insdc.org/) and associated accessions available\n\nSetup your environment secrets before running the pipeline:\n\n`nextflow secrets set WEBIN_ACCOUNT \"Webin-XXX\"`\n\n`nextflow secrets set WEBIN_PASSWORD \"XXX\"`\n\nMake sure you update commands above with your authorised credentials.\n\n## Input samplesheets\n\nFor detailed descriptions of all samplesheet columns, see the [usage documentation](docs/usage.md#samplesheet-input).\n\n### `mags` and `bins` modes (`GENOMESUBMIT`)\n\nThe input must follow `assets/schema_input_genome.json`.\n\nRequired columns:\n\n- `sample`\n- `fasta` (must end with `.fa.gz` or 
`.fasta.gz`)\n- `accession`\n- `assembly_software`\n- `binning_software`\n- `binning_parameters`\n- `stats_generation_software`\n- `metagenome`\n- `environmental_medium`\n- `broad_environment`\n- `local_environment`\n- `co-assembly`\n\nColumns that required for now, but will be optional in the nearest future:\n\n- `completeness`\n- `contamination`\n- `genome_coverage`\n- `RNA_presence`\n- `NCBI_lineage`\n\nThose fields are metadata required for [genome_uploader](https://github.com/EBI-Metagenomics/genome_uploader) package.\n\nExample `samplesheet_genome.csv`:\n\n```csv\nsample,fasta,accession,assembly_software,binning_software,binning_parameters,stats_generation_software,completeness,contamination,genome_coverage,metagenome,co-assembly,broad_environment,local_environment,environmental_medium,RNA_presence,NCBI_lineage\nlachnospira_eligens,data/bin_lachnospira_eligens.fa.gz,SRR24458089,spades_v3.15.5,metabat2_v2.6,default,CheckM2_v1.0.1,61.0,0.21,32.07,sediment metagenome,No,marine,cable_bacteria,marine_sediment,No,d__Bacteria;p__Proteobacteria;s_unclassified_Proteobacteria\n```\n\n### `metagenomic_assemblies` mode (`ASSEMBLYSUBMIT`)\n\nThe input must follow `assets/schema_input_assembly.json`.\n\nRequired columns:\n\n- `sample`\n- `fasta` (must end with `.fa.gz` or `.fasta.gz`)\n- `run_accession`\n- `assembler`\n- `assembler_version`\n\nAt least one of the following must be provided per row:\n\n- reads (`fastq_1`, optional `fastq_2` for paired-end)\n- `coverage`\n\nIf `coverage` is missing and reads are provided, the workflow calculates average coverage with `coverm`.\n\nExample `samplesheet_assembly.csv`:\n\n```csv\nsample,fasta,fastq_1,fastq_2,coverage,run_accession,assembler,assembler_version\nassembly_1,data/contigs_1.fasta.gz,data/reads_1.fastq.gz,data/reads_2.fastq.gz,,ERR011322,SPAdes,3.15.5\nassembly_2,data/contigs_2.fasta.gz,,,42.7,ERR011323,MEGAHIT,1.2.9\n```\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this 
page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\n### Required parameters:\n\n| Parameter | Description |\n| -------------------- | --------------------------------------------------------------------------------- |\n| `--mode` | Type of the data to be submitted. Options: `[mags, bins, metagenomic_assemblies]` |\n| `--input` | Path to the samplesheet describing the data to be submitted |\n| `--outdir` | Path to the output directory for pipeline results |\n| `--submission_study` | ENA study accession (PRJ/ERP) to submit the data to |\n| `--centre_name` | Name of the submitter's organisation |\n\n### Optional parameters:\n\n| Parameter | Description |\n| ------------------- | ---------------------------------------------------------------------------------------- |\n| `--upload_tpa` | Flag to control the type of assembly study (third party assembly or not). Default: false |\n| `--test_upload` | Upload to TEST ENA server instead of LIVE. Default: false |\n| `--webincli_submit` | If set to false, submissions will be validated, but not submitted. 
Default: true |\n\nGeneral command template:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile \\\n --mode \\\n --input \\\n --centre_name \\\n --submission_study \\\n --outdir \n```\n\nValidation run (submission to the ENA TEST server) in `mags` mode:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile docker \\\n --mode mags \\\n --input assets/samplesheet_genomes.csv \\\n --submission_study \\\n --centre_name TEST_CENTER \\\n --webincli_submit true \\\n --test_upload true \\\n --outdir results/validate_mags\n```\n\nValidation run (submission to the ENA TEST server) in `metagenomic_assemblies` mode:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile docker \\\n --mode metagenomic_assemblies \\\n --input assets/samplesheet_assembly.csv \\\n --submission_study \\\n --centre_name TEST_CENTER \\\n --webincli_submit true \\\n --test_upload true \\\n --outdir results/validate_assemblies\n```\n\nLive submission example:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile docker \\\n --mode metagenomic_assemblies \\\n --input assets/samplesheet_assembly.csv \\\n --submission_study PRJEB98843 \\\n --test_upload false \\\n --webincli_submit true \\\n --outdir results/live_assembly\n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/seqsubmit/usage) and the [parameter documentation](https://nf-co.re/seqsubmit/parameters).\n\n## Pipeline output\n\nKey output locations in `--outdir`:\n\n- `upload/manifests/`: generated manifest files for submission\n- `upload/webin_cli/`: ENA Webin CLI reports\n- `multiqc/`: MultiQC summary report\n- `pipeline_info/`: execution reports, trace, DAG, and software versions\n\nFor full details, see the [output documentation](https://nf-co.re/seqsubmit/output).\n\n## Credits\n\nnf-core/seqsubmit was originally written by [Martin Beracochea](https://github.com/mberacochea), [Ekaterina Sakharova](https://github.com/KateSakharova), [Sofiia Ochkalova](https://github.com/ochkalova), [Evangelos Karatzas](https://github.com/vagkaratzas).\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#seqsubmit` channel](https://nfcore.slack.com/channels/seqsubmit) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\n\n\n\n\n\n\nIf you use this pipeline please make sure to cite all used software.\nThis pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/main/LICENSE).\n\n> **MGnify: the microbiome sequence data analysis resource in 2023**\n>\n> Richardson L, Allen B, Baldi G, Beracochea M, Bileschi ML, Burdett T, et al.\n>\n> Vol. 51, Nucleic Acids Research. Oxford University Press (OUP); 2022. p. 
D753\u20139. Available from: http://dx.doi.org/10.1093/nar/gkac1080\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", + "description": "

\n \n \n \"nf-core/seqsubmit\"\n \n

\n\n[![Open in GitHub Codespaces](https://img.shields.io/badge/Open_In_GitHub_Codespaces-black?labelColor=grey&logo=github)](https://github.com/codespaces/new/nf-core/seqsubmit)\n[![GitHub Actions CI Status](https://github.com/nf-core/seqsubmit/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/seqsubmit/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/seqsubmit/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/seqsubmit/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/seqsubmit/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.04.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.5.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.5.1)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/seqsubmit)\n\n[![Get help on 
Slack](http://img.shields.io/badge/slack-nf--core%20%23seqsubmit-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/seqsubmit)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/seqsubmit** is a Nextflow pipeline for submitting sequence data to [ENA](https://www.ebi.ac.uk/ena/browser/home).\nCurrently, the pipeline supports three submission modes, each routed to a dedicated workflow and requiring its own input samplesheet structure:\n\n- `mags` for Metagenome Assembled Genomes (MAGs) submission with `GENOMESUBMIT` workflow\n- `bins` for bins submission with `GENOMESUBMIT` workflow\n- `metagenomic_assemblies` for assembly submission with `ASSEMBLYSUBMIT` workflow\n\n![seqsubmit workflow diagram](assets/seqsubmit_schema.png)\n\n## Requirements\n\n- [Nextflow](https://www.nextflow.io/) `>=25.04.0`\n- Webin account registered at https://www.ebi.ac.uk/ena/submit/webin/login\n- Raw reads used to assemble contigs submitted to [INSDC](https://www.insdc.org/) and associated accessions available\n\nSetup your environment secrets before running the pipeline:\n\n`nextflow secrets set ENA_WEBIN \"Webin-XXX\"`\n\n`nextflow secrets set ENA_WEBIN_PASSWORD \"XXX\"`\n\nMake sure you update commands above with your authorised credentials.\n\n## Input samplesheets\n\nFor detailed descriptions of all samplesheet columns, see the [usage documentation](docs/usage.md#samplesheet-input).\n\n### `mags` and `bins` modes (`GENOMESUBMIT`)\n\nThe input must follow `assets/schema_input_genome.json`.\n\nRequired columns:\n\n- `sample`\n- `fasta` (must end with `.fa.gz`, 
`.fasta.gz`, or `.fna.gz`)\n- `accession`\n- `assembly_software`\n- `binning_software`\n- `binning_parameters`\n- `metagenome`\n- `environmental_medium`\n- `broad_environment`\n- `local_environment`\n- `co-assembly`\n\nAt least one of the following must be provided per row:\n\n- reads (`fastq_1`, optional `fastq_2` for paired-end)\n- `genome_coverage`\n\nAdditional supported columns:\n\n- `stats_generation_software`\n- `completeness`\n- `contamination`\n- `RNA_presence`\n- `NCBI_lineage`\n\nIf `genome_coverage`, `stats_generation_software`, `completeness`, `contamination`, `RNA_presence`, or `NCBI_lineage` are missing, the workflow can calculate or infer them when the required inputs are available.\n\nThose fields are metadata required for the [genome_uploader](https://github.com/EBI-Metagenomics/genome_uploader) package.\n\nExample `samplesheet_genomes.csv`:\n\n```csv\nsample,fasta,accession,fastq_1,fastq_2,assembly_software,binning_software,binning_parameters,stats_generation_software,completeness,contamination,genome_coverage,metagenome,co-assembly,broad_environment,local_environment,environmental_medium,RNA_presence,NCBI_lineage\nlachnospira_eligens,data/bin_lachnospira_eligens.fa.gz,SRR24458089,,,spades_v3.15.5,metabat2_v2.6,default,CheckM2_v1.0.1,61.0,0.21,32.07,sediment metagenome,No,marine,cable_bacteria,marine_sediment,No,d__Bacteria;p__Proteobacteria;s__unclassified_Proteobacteria\n```\n\n> [!IMPORTANT]\n> **Samplesheet column requirements**: All columns shown in the example above must be present in your samplesheet, even if some values are empty. 
Columns must be in exactly the same order as shown.\n\n### `metagenomic_assemblies` mode (`ASSEMBLYSUBMIT`)\n\nThe input must follow `assets/schema_input_assembly.json`.\n\nRequired columns:\n\n- `sample`\n- `fasta` (must end with `.fa.gz`, `.fasta.gz`, or `.fna.gz`)\n- `run_accession`\n- `assembler`\n- `assembler_version`\n\nAt least one of the following must be provided per row:\n\n- reads (`fastq_1`, optional `fastq_2` for paired-end)\n- `coverage`\n\nIf `coverage` is missing and reads are provided, the workflow calculates average coverage with `coverm`.\n\nExample `samplesheet_assembly.csv`:\n\n```csv\nsample,fasta,fastq_1,fastq_2,coverage,run_accession,assembler,assembler_version\nassembly_1,data/contigs_1.fasta.gz,data/reads_1.fastq.gz,data/reads_2.fastq.gz,,ERR011322,SPAdes,3.15.5\nassembly_2,data/contigs_2.fasta.gz,,,42.7,ERR011323,MEGAHIT,1.2.9\n```\n\n> [!IMPORTANT]\n> **Samplesheet column requirements**: All columns shown in the example above must be present in your samplesheet, even if some values are empty. Columns must be in exactly the same order as shown.\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\n### Submission study\n\nAll data submitted through this pipeline must be associated with an ENA study (project). You can either pass an accession of your existing study via `--submission_study` or provide a metadata file via `--study_metadata` and the pipeline will register the study with ENA before submitting your data.\n\nSee the [usage documentation](docs/usage.md#submission-study) for more details.\n\n### Database setup (`CheckM2` and `CAT_pack`)\n\nThe `mags`/`bins` workflow requires databases for completeness/contamination estimation and taxonomy assignment. 
See [Usage documentation](usage.md) for details.\n\n### Required parameters:\n\n| Parameter | Description |\n| ------------------------------------------ | ----------------------------------------------------------------------------------------------------------------- |\n| `--mode` | Type of the data to be submitted. Options: `[mags, bins, metagenomic_assemblies]` |\n| `--input` | Path to the samplesheet describing the data to be submitted |\n| `--outdir` | Path to the output directory for pipeline results |\n| `--submission_study` OR `--study_metadata` | ENA study accession (PRJ/ERP) to submit the data to OR metadata file in JSON/TSV/CSV format to register new study |\n| `--centre_name` | Name of the submitter's organisation |\n\n### Optional parameters:\n\n| Parameter | Description |\n| ------------------- | ---------------------------------------------------------------------------------------- |\n| `--upload_tpa` | Flag to control the type of assembly study (third party assembly or not). Default: false |\n| `--test_upload` | Upload to TEST ENA server instead of LIVE. Default: true |\n| `--webincli_submit` | If set to false, submissions will be validated, but not submitted. 
Default: true |\n\nGeneral command template:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile \\\n --mode \\\n --input \\\n --centre_name \\\n --submission_study \\\n --outdir \n```\n\nValidation run (submission to the ENA TEST server) in `mags` mode:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile docker \\\n --mode mags \\\n --input assets/samplesheet_genomes.csv \\\n --submission_study \\\n --centre_name TEST_CENTER \\\n --webincli_submit true \\\n --test_upload true \\\n --outdir results/validate_mags\n```\n\nValidation run (submission to the ENA TEST server) in `metagenomic_assemblies` mode:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile docker \\\n --mode metagenomic_assemblies \\\n --input assets/samplesheet_assembly.csv \\\n --submission_study \\\n --centre_name TEST_CENTER \\\n --webincli_submit true \\\n --test_upload true \\\n --outdir results/validate_assemblies\n```\n\nLive submission example:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile docker \\\n --mode metagenomic_assemblies \\\n --input assets/samplesheet_assembly.csv \\\n --submission_study PRJEB98843 \\\n --test_upload false \\\n --webincli_submit true \\\n --outdir results/live_assembly\n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/seqsubmit/usage) and the [parameter documentation](https://nf-co.re/seqsubmit/parameters).\n\n## Pipeline output\n\nKey output locations in `--outdir`:\n\n- `mags/` or `bins/`: genome metadata, manifests, and per-sample submission support files\n- `metagenomic_assemblies/`: assembly metadata CSVs and per-sample coverage files\n- `multiqc/`: MultiQC summary report\n- `pipeline_info/`: execution reports, trace, DAG, and software versions\n\nFor full details, see the [output documentation](https://nf-co.re/seqsubmit/output).\n\n## Credits\n\nnf-core/seqsubmit was originally written by [Martin Beracochea](https://github.com/mberacochea), [Ekaterina Sakharova](https://github.com/KateSakharova), [Sofiia Ochkalova](https://github.com/ochkalova), [Evangelos Karatzas](https://github.com/vagkaratzas).\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#seqsubmit` channel](https://nfcore.slack.com/channels/seqsubmit) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\n\n\n\n\n\n\nIf you use this pipeline please make sure to cite all used software.\nThis pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/main/LICENSE).\n\n> **MGnify: the microbiome sequence data analysis resource in 2023**\n>\n> Richardson L, Allen B, Baldi G, Beracochea M, Bileschi ML, Burdett T, et al.\n>\n> Vol. 
51, Nucleic Acids Research. Oxford University Press (OUP); 2022. p. D753\u20139. Available from: http://dx.doi.org/10.1093/nar/gkac1080\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", "hasPart": [ { "@id": "main.nf" diff --git a/subworkflows/local/genome_evaluation.nf b/subworkflows/local/genome_evaluation.nf index b80b854..6eb4fd9 100644 --- a/subworkflows/local/genome_evaluation.nf +++ b/subworkflows/local/genome_evaluation.nf @@ -20,18 +20,27 @@ include { CHECKM2_PREDICT } from '../../modules/nf-core/checkm2/predict workflow GENOME_EVALUATION { take: - ch_fasta // [meta, fasta_file] + ch_fasta // channel: [ val(meta), path(fasta) ] main: ch_versions = channel.empty() - // Run checkM2 database download if there is no db path provided + // + // Database preparation + // + if (!params.checkm2_db || !file(params.checkm2_db).exists()) { - CHECKM2_DATABASEDOWNLOAD(params.checkm2_db_zenodo_id) - ch_check2_db = CHECKM2_DATABASEDOWNLOAD.out.database + // Conditional download: only trigger if ch_fasta has items + ch_download_trigger = ch_fasta + .map { _meta, _fasta -> params.checkm2_db_zenodo_id } + .first() // Only need one trigger regardless of how many fasta files + + CHECKM2_DATABASEDOWNLOAD(ch_download_trigger) + ch_checkm2_db = CHECKM2_DATABASEDOWNLOAD.out.database } else { - ch_check2_db = channel.of( + // Use existing database + ch_checkm2_db = channel.of( [ [id: "checkm2_db"], file(params.checkm2_db), @@ -39,13 +48,17 @@ workflow GENOME_EVALUATION { ) } + // + // Genome evaluation 
+ // + CHECKM2_PREDICT( ch_fasta, - ch_check2_db.first(), + ch_checkm2_db, ) emit: - genome_evaluation = CHECKM2_PREDICT.out.checkm2_tsv // [meta, stats.tsv] - stats_versions = CHECKM2_PREDICT.out.versions_checkm2_predict + genome_evaluation = CHECKM2_PREDICT.out.checkm2_tsv // channel: [ val(meta), path(tsv) ] + stats_versions = CHECKM2_PREDICT.out.versions_checkm2_predict } diff --git a/subworkflows/nf-core/fasta_classify_catpack/fasta_classify_catpack.diff b/subworkflows/nf-core/fasta_classify_catpack/fasta_classify_catpack.diff index 6a541ba..e704c20 100644 --- a/subworkflows/nf-core/fasta_classify_catpack/fasta_classify_catpack.diff +++ b/subworkflows/nf-core/fasta_classify_catpack/fasta_classify_catpack.diff @@ -21,6 +21,24 @@ Changes in 'fasta_classify_catpack/main.nf': ch_cat_db_input.other.subscribe { _meta, _db -> error("Error: A DB was provided to FASTA_CLASSIFY_CATPACK that is not a `.tar.gz` or a directory.") +@@ -51,8 +55,15 @@ + taxonomy: [meta, dir / 'tax'] + } + +- // Download and prepare db from scratch if no pre-built db provided +- CATPACK_DOWNLOAD(ch_cat_db_download_id) ++ // Download and prepare db from scratch if no pre-built db provided - only trigger if ch_bins OR ch_contigs has items ++ // Mix both channels and use first item to trigger download once ++ ch_download_trigger = ch_bins ++ .mix(ch_contigs) ++ .first() ++ .combine(ch_cat_db_download_id) ++ .map { _meta, _fasta, db_meta, db_id -> [db_meta, db_id] } ++ ++ CATPACK_DOWNLOAD(ch_download_trigger) + + CATPACK_PREPARE( + CATPACK_DOWNLOAD.out.fasta, 'subworkflows/nf-core/fasta_classify_catpack/tests/main.nf.test.snap' is unchanged 'subworkflows/nf-core/fasta_classify_catpack/tests/nextflow.config' is unchanged diff --git a/subworkflows/nf-core/fasta_classify_catpack/main.nf b/subworkflows/nf-core/fasta_classify_catpack/main.nf index e362a5c..1d024ea 100644 --- a/subworkflows/nf-core/fasta_classify_catpack/main.nf +++ b/subworkflows/nf-core/fasta_classify_catpack/main.nf @@ -55,8 +55,15 
@@ workflow FASTA_CLASSIFY_CATPACK { taxonomy: [meta, dir / 'tax'] } - // Download and prepare db from scratch if no pre-built db provided - CATPACK_DOWNLOAD(ch_cat_db_download_id) + // Download and prepare db from scratch if no pre-built db provided - only trigger if ch_bins OR ch_contigs has items + // Mix both channels and use first item to trigger download once + ch_download_trigger = ch_bins + .mix(ch_contigs) + .first() + .combine(ch_cat_db_download_id) + .map { _meta, _fasta, db_meta, db_id -> [db_meta, db_id] } + + CATPACK_DOWNLOAD(ch_download_trigger) CATPACK_PREPARE( CATPACK_DOWNLOAD.out.fasta, diff --git a/tests/.nftignore b/tests/.nftignore index 83f7a0a..b99c781 100644 --- a/tests/.nftignore +++ b/tests/.nftignore @@ -8,3 +8,17 @@ multiqc/multiqc_data/llms-full.txt multiqc/multiqc_plots/{svg,pdf,png}/*.{svg,pdf,png} multiqc/multiqc_report.html pipeline_info/*.{html,json,txt,yml} +**_webin-cli.report +**/webin_cli/*accessions.tsv +**/MAG_upload/registered_MAGs.tsv +**/MAG_upload/registered_MAGs_test.tsv +**/bin_upload/registered_bins.tsv +**/bin_upload/registered_bins_test.tsv +**/MAG_upload/manifests/ +**/bin_upload/manifests/ +**/MAG_upload/manifests_test/* +**/bin_upload/manifests_test/* +**/MAG_upload/genome_samples.xml +**/bin_upload/genome_samples.xml +**/rna/trnascanse/*.stats +**/rna/trnascanse/*.log diff --git a/tests/assembly_complete_metadata.nf.test b/tests/assembly_complete_metadata.nf.test new file mode 100644 index 0000000..bad0eea --- /dev/null +++ b/tests/assembly_complete_metadata.nf.test @@ -0,0 +1,39 @@ +nextflow_pipeline { + + name "Test assembly submission workflow stub - complete_metadata" + script "../main.nf" + tag "pipeline" + tag "mode_assembly" + tag "test_assembly_complete_metadata" + profile "test_assembly_complete_metadata" + + test("-profile test_assembly_complete_metadata") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + 
def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/assembly_complete_metadata.nf.test.snap b/tests/assembly_complete_metadata.nf.test.snap new file mode 100644 index 0000000..12945aa --- /dev/null +++ b/tests/assembly_complete_metadata.nf.test.snap @@ -0,0 +1,39 @@ +{ + "-profile test_assembly_complete_metadata": { + "content": [ + 5, + { + "Workflow": { + "nf-core/seqsubmit": "v1.0.0dev" + } + }, + [ + "metagenomic_assemblies", + "metagenomic_assemblies/complete_metadata", + "metagenomic_assemblies/complete_metadata/upload", + "metagenomic_assemblies/complete_metadata/upload/webin_cli", + "metagenomic_assemblies/complete_metadata/upload/webin_cli/complete_metadata_accessions.tsv", + "metagenomic_assemblies/complete_metadata_assembly_metadata.csv", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml" + ], + [ + 
"complete_metadata_assembly_metadata.csv:md5,d5b1575095ece78d988395b874440bef", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "25.04.1" + }, + "timestamp": "2026-03-27T12:17:10.294831" + } +} \ No newline at end of file diff --git a/tests/assembly_no_coverage_paired_reads.nf.test b/tests/assembly_no_coverage_paired_reads.nf.test new file mode 100644 index 0000000..41f2b64 --- /dev/null +++ b/tests/assembly_no_coverage_paired_reads.nf.test @@ -0,0 +1,39 @@ +nextflow_pipeline { + + name "Test assembly submission workflow stub - no_coverage_paired_reads" + script "../main.nf" + tag "pipeline" + tag "mode_assembly" + tag "test_assembly_no_coverage_paired_reads" + profile "test_assembly_no_coverage_paired_reads" + + test("-profile test_assembly_no_coverage_paired_reads") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/assembly_no_coverage_paired_reads.nf.test.snap b/tests/assembly_no_coverage_paired_reads.nf.test.snap new file mode 100644 index 
0000000..3f62ac9 --- /dev/null +++ b/tests/assembly_no_coverage_paired_reads.nf.test.snap @@ -0,0 +1,42 @@ +{ + "-profile test_assembly_no_coverage_paired_reads": { + "content": [ + 6, + { + "Workflow": { + "nf-core/seqsubmit": "v1.0.0dev" + } + }, + [ + "metagenomic_assemblies", + "metagenomic_assemblies/no_coverage_paired_reads", + "metagenomic_assemblies/no_coverage_paired_reads/coverage", + "metagenomic_assemblies/no_coverage_paired_reads/coverage/no_coverage_paired_reads.depth.txt", + "metagenomic_assemblies/no_coverage_paired_reads/upload", + "metagenomic_assemblies/no_coverage_paired_reads/upload/webin_cli", + "metagenomic_assemblies/no_coverage_paired_reads/upload/webin_cli/no_coverage_paired_reads_accessions.tsv", + "metagenomic_assemblies/no_coverage_paired_reads_assembly_metadata.csv", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml" + ], + [ + "no_coverage_paired_reads.depth.txt:md5,bb5f99e74d21df3c73e0ae0f388bcbcb", + "no_coverage_paired_reads_assembly_metadata.csv:md5,91a2616ccedc6bb93c2209153bec50f0", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "25.04.1" + }, + "timestamp": "2026-03-27T12:17:51.720414" + } +} \ No newline at end of file diff --git a/tests/assembly_no_coverage_single_reads.nf.test b/tests/assembly_no_coverage_single_reads.nf.test new file mode 100644 index 0000000..f990601 --- /dev/null +++ b/tests/assembly_no_coverage_single_reads.nf.test @@ -0,0 +1,39 @@ +nextflow_pipeline { + + name "Test assembly submission workflow stub - no_coverage_single_reads" + script "../main.nf" + tag "pipeline" + tag "mode_assembly" + tag 
"test_assembly_no_coverage_single_reads" + profile "test_assembly_no_coverage_single_reads" + + test("-profile test_assembly_no_coverage_single_reads") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/assembly_no_coverage_single_reads.nf.test.snap b/tests/assembly_no_coverage_single_reads.nf.test.snap new file mode 100644 index 0000000..dea32b5 --- /dev/null +++ b/tests/assembly_no_coverage_single_reads.nf.test.snap @@ -0,0 +1,42 @@ +{ + "-profile test_assembly_no_coverage_single_reads": { + "content": [ + 6, + { + "Workflow": { + "nf-core/seqsubmit": "v1.0.0dev" + } + }, + [ + "metagenomic_assemblies", + "metagenomic_assemblies/no_coverage_single_reads", + "metagenomic_assemblies/no_coverage_single_reads/coverage", + "metagenomic_assemblies/no_coverage_single_reads/coverage/no_coverage_single_reads.depth.txt", + "metagenomic_assemblies/no_coverage_single_reads/upload", + "metagenomic_assemblies/no_coverage_single_reads/upload/webin_cli", + 
"metagenomic_assemblies/no_coverage_single_reads/upload/webin_cli/no_coverage_single_reads_accessions.tsv", + "metagenomic_assemblies/no_coverage_single_reads_assembly_metadata.csv", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml" + ], + [ + "no_coverage_single_reads.depth.txt:md5,bd88282b17e851377b1dd223839be150", + "no_coverage_single_reads_assembly_metadata.csv:md5,fc1beef7e8439eaf5329e02587460009", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "25.04.1" + }, + "timestamp": "2026-03-27T12:18:28.483809" + } +} \ No newline at end of file diff --git a/tests/assembly_no_study_complete_metadata.nf.test.snap b/tests/assembly_no_study_complete_metadata.nf.test.snap new file mode 100644 index 0000000..f2c272d --- /dev/null +++ b/tests/assembly_no_study_complete_metadata.nf.test.snap @@ -0,0 +1,42 @@ +{ + "-profile test_assembly_no_study_complete_metadata": { + "content": [ + 6, + { + "REGISTERSTUDY": { + "mgnify-pipelines-toolkit": "1.4.21" + }, + "Workflow": { + "nf-core/seqsubmit": "v1.0.0dev" + } + }, + [ + "metagenomic_assemblies", + "metagenomic_assemblies/complete_metadata", + "metagenomic_assemblies/complete_metadata/upload", + "metagenomic_assemblies/complete_metadata/upload/webin_cli", + "metagenomic_assemblies/complete_metadata/upload/webin_cli/complete_metadata_accessions.tsv", + "metagenomic_assemblies/complete_metadata_assembly_metadata.csv", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + 
"multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml" + ], + [ + "complete_metadata_assembly_metadata.csv:md5,d5b1575095ece78d988395b874440bef", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "25.04.1" + }, + "timestamp": "2026-03-27T12:20:59.186788" + } +} \ No newline at end of file diff --git a/tests/assembly_one_contig.nf.test b/tests/assembly_one_contig.nf.test new file mode 100644 index 0000000..2ce8f4e --- /dev/null +++ b/tests/assembly_one_contig.nf.test @@ -0,0 +1,39 @@ +nextflow_pipeline { + + name "Test assembly submission workflow stub - one_contig" + script "../main.nf" + tag "pipeline" + tag "mode_assembly" + tag "test_assembly_one_contig" + profile "test_assembly_one_contig" + + test("-profile test_assembly_one_contig") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git 
a/tests/assembly_one_contig.nf.test.snap b/tests/assembly_one_contig.nf.test.snap new file mode 100644 index 0000000..2ebe8b9 --- /dev/null +++ b/tests/assembly_one_contig.nf.test.snap @@ -0,0 +1,32 @@ +{ + "-profile test_assembly_one_contig": { + "content": [ + 3, + { + "Workflow": { + "nf-core/seqsubmit": "v1.0.0dev" + } + }, + [ + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml" + ], + [ + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "25.04.1" + }, + "timestamp": "2026-03-27T12:21:16.358644" + } +} \ No newline at end of file diff --git a/tests/default.nf.test b/tests/default.nf.test index 44f2465..4b446ab 100644 --- a/tests/default.nf.test +++ b/tests/default.nf.test @@ -3,6 +3,8 @@ nextflow_pipeline { name "Test pipeline" script "../main.nf" tag "pipeline" + tag "mode_mag" + tag "test_mag_multiple_bins_missing_metadata" test("-profile test") { @@ -14,12 +16,15 @@ nextflow_pipeline { then { // stable_name: All files + folders in ${params.outdir}/ with a stable name - def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}', '**/manifests_test/*']) // stable_path: All files in ${params.outdir}/ with stable content def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success assertAll( - { assert workflow.success}, { assert snapshot( + // Number of successful 
tasks + workflow.trace.succeeded().size(), // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions removeNextflowVersion("$outputDir/pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml"), // All stable path name, with a relative path diff --git a/tests/default.nf.test.snap b/tests/default.nf.test.snap new file mode 100644 index 0000000..e480827 --- /dev/null +++ b/tests/default.nf.test.snap @@ -0,0 +1,87 @@ +{ + "-profile test": { + "content": [ + 17, + { + "BARRNAP": { + "barrnap": 0.9 + }, + "COUNT_RNA": { + "python": "3.8.13" + }, + "COVERM_GENOME": { + "coverm": "0.7.0" + }, + "TRNASCANSE": { + "tRNAscan-SE": "2.0.12" + }, + "Workflow": { + "nf-core/seqsubmit": "v1.0.0dev" + } + }, + [ + "mags", + "mags/genomes_metadata.csv", + "mags/no_coverage_paired_reads", + "mags/no_coverage_paired_reads/coverage", + "mags/no_coverage_paired_reads/coverage/no_coverage_paired_reads.tsv", + "mags/no_coverage_paired_reads/upload", + "mags/no_coverage_paired_reads/upload/webin_cli", + "mags/no_coverage_paired_reads/upload/webin_cli/no_coverage_paired_reads_accessions.tsv", + "mags/no_rna_presence", + "mags/no_rna_presence/rna", + "mags/no_rna_presence/rna/barrnap", + "mags/no_rna_presence/rna/barrnap/no_rna_presence_bac.gff", + "mags/no_rna_presence/rna/trnascanse", + "mags/no_rna_presence/rna/trnascanse/no_rna_presence.log", + "mags/no_rna_presence/rna/trnascanse/no_rna_presence.stats", + "mags/no_rna_presence/rna/trnascanse/no_rna_presence.tsv", + "mags/no_rna_presence/upload", + "mags/no_rna_presence/upload/webin_cli", + "mags/no_rna_presence/upload/webin_cli/no_rna_presence_accessions.tsv", + "mags/no_taxonomy", + "mags/no_taxonomy/taxonomy", + "mags/no_taxonomy/taxonomy/no_taxonomy.bin2classification.txt", + "mags/no_taxonomy/taxonomy/no_taxonomy.txt", + "mags/no_taxonomy/upload", + "mags/no_taxonomy/upload/webin_cli", + "mags/no_taxonomy/upload/webin_cli/no_taxonomy_accessions.tsv", + 
"mags/upload", + "mags/upload/manifests", + "mags/upload/manifests/results", + "mags/upload/manifests/results/MAG_upload", + "mags/upload/manifests/results/MAG_upload/ENA_backup.json", + "mags/upload/manifests/results/MAG_upload/genome_samples.xml", + "mags/upload/manifests/results/MAG_upload/manifests_test", + "mags/upload/manifests/results/MAG_upload/registered_MAGs_test.tsv", + "mags/upload/manifests/results/MAG_upload/submission.xml", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml" + ], + [ + "genomes_metadata.csv:md5,a75d1d35c762863c487f010f6a000910", + "no_coverage_paired_reads.tsv:md5,fb8374996c3bad9ddd296684d8c28628", + "no_rna_presence_bac.gff:md5,df19e1b84ba6f691d20c72b397c88abf", + "no_rna_presence.tsv:md5,abd958e8addba39c9e4d7f6b97a1a7c6", + "no_taxonomy.bin2classification.txt:md5,e24c109efe807c7044a866999fd736f1", + "no_taxonomy.txt:md5,0f2082d3e397fd630d605dd60993b131", + "ENA_backup.json:md5,e339bef4b1e1e11c37ef72f3d9ef2c93", + "submission.xml:md5,4b7d1d7e1b883a3eac57e1267943a8d6", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "25.04.1" + }, + "timestamp": "2026-03-27T13:44:33.345037" + } +} \ No newline at end of file diff --git a/tests/mag_complete_metadata.nf.test b/tests/mag_complete_metadata.nf.test new file mode 100644 index 0000000..9b42848 --- /dev/null +++ b/tests/mag_complete_metadata.nf.test @@ -0,0 +1,39 @@ +nextflow_pipeline { + + name "Test genome submission workflow - complete_metadata" + script "../main.nf" + tag "pipeline" + tag "mode_mag" + tag "test_mag_complete_metadata" + profile "test_mag_complete_metadata" + + test("-profile 
test_mag_complete_metadata") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}', '**/manifests_test/*']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/mag_complete_metadata.nf.test.snap b/tests/mag_complete_metadata.nf.test.snap new file mode 100644 index 0000000..fff69a6 --- /dev/null +++ b/tests/mag_complete_metadata.nf.test.snap @@ -0,0 +1,50 @@ +{ + "-profile test_mag_complete_metadata": { + "content": [ + 5, + { + "Workflow": { + "nf-core/seqsubmit": "v1.0.0dev" + } + }, + [ + "mags", + "mags/complete_metadata", + "mags/complete_metadata/upload", + "mags/complete_metadata/upload/webin_cli", + "mags/complete_metadata/upload/webin_cli/complete_metadata_accessions.tsv", + "mags/genomes_metadata.csv", + "mags/upload", + "mags/upload/manifests", + "mags/upload/manifests/results", + "mags/upload/manifests/results/MAG_upload", + "mags/upload/manifests/results/MAG_upload/ENA_backup.json", + "mags/upload/manifests/results/MAG_upload/genome_samples.xml", + "mags/upload/manifests/results/MAG_upload/manifests_test", + 
"mags/upload/manifests/results/MAG_upload/registered_MAGs_test.tsv", + "mags/upload/manifests/results/MAG_upload/submission.xml", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml" + ], + [ + "genomes_metadata.csv:md5,ae2b884015d1850fa63365e8a64d9fc8", + "ENA_backup.json:md5,e339bef4b1e1e11c37ef72f3d9ef2c93", + "submission.xml:md5,4b7d1d7e1b883a3eac57e1267943a8d6", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "25.04.1" + }, + "timestamp": "2026-03-27T12:22:02.015517" + } +} \ No newline at end of file diff --git a/tests/mag_no_coverage_paired_reads.nf.test b/tests/mag_no_coverage_paired_reads.nf.test new file mode 100644 index 0000000..791428a --- /dev/null +++ b/tests/mag_no_coverage_paired_reads.nf.test @@ -0,0 +1,39 @@ +nextflow_pipeline { + + name "Test genome submission workflow stub - no_coverage_paired_reads" + script "../main.nf" + tag "pipeline" + tag "mode_mag" + tag "test_mag_no_coverage_paired_reads" + profile "test_mag_no_coverage_paired_reads" + + test("-profile test_mag_no_coverage_paired_reads") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}', '**/manifests_test/*']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert 
snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/mag_no_coverage_paired_reads.nf.test.snap b/tests/mag_no_coverage_paired_reads.nf.test.snap new file mode 100644 index 0000000..b0ccfa1 --- /dev/null +++ b/tests/mag_no_coverage_paired_reads.nf.test.snap @@ -0,0 +1,56 @@ +{ + "-profile test_mag_no_coverage_paired_reads": { + "content": [ + 6, + { + "COVERM_GENOME": { + "coverm": "0.7.0" + }, + "Workflow": { + "nf-core/seqsubmit": "v1.0.0dev" + } + }, + [ + "mags", + "mags/genomes_metadata.csv", + "mags/no_coverage_paired_reads", + "mags/no_coverage_paired_reads/coverage", + "mags/no_coverage_paired_reads/coverage/no_coverage_paired_reads.tsv", + "mags/no_coverage_paired_reads/upload", + "mags/no_coverage_paired_reads/upload/webin_cli", + "mags/no_coverage_paired_reads/upload/webin_cli/no_coverage_paired_reads_accessions.tsv", + "mags/upload", + "mags/upload/manifests", + "mags/upload/manifests/results", + "mags/upload/manifests/results/MAG_upload", + "mags/upload/manifests/results/MAG_upload/ENA_backup.json", + "mags/upload/manifests/results/MAG_upload/genome_samples.xml", + "mags/upload/manifests/results/MAG_upload/manifests_test", + "mags/upload/manifests/results/MAG_upload/registered_MAGs_test.tsv", + "mags/upload/manifests/results/MAG_upload/submission.xml", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + 
"multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml" + ], + [ + "genomes_metadata.csv:md5,0538b1aec26168b35a416bd995b0a4a9", + "no_coverage_paired_reads.tsv:md5,fb8374996c3bad9ddd296684d8c28628", + "ENA_backup.json:md5,e339bef4b1e1e11c37ef72f3d9ef2c93", + "submission.xml:md5,4b7d1d7e1b883a3eac57e1267943a8d6", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "25.04.1" + }, + "timestamp": "2026-03-27T12:32:54.180853" + } +} \ No newline at end of file diff --git a/tests/mag_no_coverage_single_reads.nf.test b/tests/mag_no_coverage_single_reads.nf.test new file mode 100644 index 0000000..4f7d22f --- /dev/null +++ b/tests/mag_no_coverage_single_reads.nf.test @@ -0,0 +1,39 @@ +nextflow_pipeline { + + name "Test genome submission workflow stub - no_coverage_single_reads" + script "../main.nf" + tag "pipeline" + tag "mode_mag" + tag "test_mag_no_coverage_single_reads" + profile "test_mag_no_coverage_single_reads" + + test("-profile test_mag_no_coverage_single_reads") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}', '**/manifests_test/*']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml"), + // All stable 
path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/mag_no_coverage_single_reads.nf.test.snap b/tests/mag_no_coverage_single_reads.nf.test.snap new file mode 100644 index 0000000..a95ad24 --- /dev/null +++ b/tests/mag_no_coverage_single_reads.nf.test.snap @@ -0,0 +1,56 @@ +{ + "-profile test_mag_no_coverage_single_reads": { + "content": [ + 6, + { + "COVERM_GENOME": { + "coverm": "0.7.0" + }, + "Workflow": { + "nf-core/seqsubmit": "v1.0.0dev" + } + }, + [ + "mags", + "mags/genomes_metadata.csv", + "mags/no_coverage_single_reads", + "mags/no_coverage_single_reads/coverage", + "mags/no_coverage_single_reads/coverage/no_coverage_single_reads.tsv", + "mags/no_coverage_single_reads/upload", + "mags/no_coverage_single_reads/upload/webin_cli", + "mags/no_coverage_single_reads/upload/webin_cli/no_coverage_single_reads_accessions.tsv", + "mags/upload", + "mags/upload/manifests", + "mags/upload/manifests/results", + "mags/upload/manifests/results/MAG_upload", + "mags/upload/manifests/results/MAG_upload/ENA_backup.json", + "mags/upload/manifests/results/MAG_upload/genome_samples.xml", + "mags/upload/manifests/results/MAG_upload/manifests_test", + "mags/upload/manifests/results/MAG_upload/registered_MAGs_test.tsv", + "mags/upload/manifests/results/MAG_upload/submission.xml", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml" + ], + [ + "genomes_metadata.csv:md5,1f56050125362470f351ed99065af980", + "no_coverage_single_reads.tsv:md5,3791d9be880cbfacbc527e47623e3c9a", + "ENA_backup.json:md5,e339bef4b1e1e11c37ef72f3d9ef2c93", + 
"submission.xml:md5,4b7d1d7e1b883a3eac57e1267943a8d6", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "25.04.1" + }, + "timestamp": "2026-03-27T12:33:41.342215" + } +} \ No newline at end of file diff --git a/tests/mag_no_rna_presence.nf.test b/tests/mag_no_rna_presence.nf.test new file mode 100644 index 0000000..07ed498 --- /dev/null +++ b/tests/mag_no_rna_presence.nf.test @@ -0,0 +1,39 @@ +nextflow_pipeline { + + name "Test genome submission workflow stub - no_rna_presence" + script "../main.nf" + tag "pipeline" + tag "mode_mag" + tag "test_mag_no_rna_presence" + profile "test_mag_no_rna_presence" + + test("-profile test_mag_no_rna_presence") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}', '**/manifests_test/*']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/mag_no_rna_presence.nf.test.snap b/tests/mag_no_rna_presence.nf.test.snap new file mode 100644 index 0000000..de398e7 --- /dev/null +++ b/tests/mag_no_rna_presence.nf.test.snap @@ -0,0 +1,68 @@ +{ + "-profile 
test_mag_no_rna_presence": { + "content": [ + 8, + { + "BARRNAP": { + "barrnap": 0.9 + }, + "COUNT_RNA": { + "python": "3.8.13" + }, + "TRNASCANSE": { + "tRNAscan-SE": "2.0.12" + }, + "Workflow": { + "nf-core/seqsubmit": "v1.0.0dev" + } + }, + [ + "mags", + "mags/genomes_metadata.csv", + "mags/no_rna_presence", + "mags/no_rna_presence/rna", + "mags/no_rna_presence/rna/barrnap", + "mags/no_rna_presence/rna/barrnap/no_rna_presence_bac.gff", + "mags/no_rna_presence/rna/trnascanse", + "mags/no_rna_presence/rna/trnascanse/no_rna_presence.log", + "mags/no_rna_presence/rna/trnascanse/no_rna_presence.stats", + "mags/no_rna_presence/rna/trnascanse/no_rna_presence.tsv", + "mags/no_rna_presence/upload", + "mags/no_rna_presence/upload/webin_cli", + "mags/no_rna_presence/upload/webin_cli/no_rna_presence_accessions.tsv", + "mags/upload", + "mags/upload/manifests", + "mags/upload/manifests/results", + "mags/upload/manifests/results/MAG_upload", + "mags/upload/manifests/results/MAG_upload/ENA_backup.json", + "mags/upload/manifests/results/MAG_upload/genome_samples.xml", + "mags/upload/manifests/results/MAG_upload/manifests_test", + "mags/upload/manifests/results/MAG_upload/registered_MAGs_test.tsv", + "mags/upload/manifests/results/MAG_upload/submission.xml", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml" + ], + [ + "genomes_metadata.csv:md5,0f567491f038d4a0c1dbf4c05cdc26c0", + "no_rna_presence_bac.gff:md5,df19e1b84ba6f691d20c72b397c88abf", + "no_rna_presence.tsv:md5,abd958e8addba39c9e4d7f6b97a1a7c6", + "ENA_backup.json:md5,e339bef4b1e1e11c37ef72f3d9ef2c93", + "submission.xml:md5,4b7d1d7e1b883a3eac57e1267943a8d6", + 
"multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "25.04.1" + }, + "timestamp": "2026-03-27T12:38:24.654035" + } +} \ No newline at end of file diff --git a/tests/mag_no_study_complete_metadata.nf.test.snap b/tests/mag_no_study_complete_metadata.nf.test.snap new file mode 100644 index 0000000..11d461d --- /dev/null +++ b/tests/mag_no_study_complete_metadata.nf.test.snap @@ -0,0 +1,53 @@ +{ + "-profile test_mag_no_study_complete_metadata": { + "content": [ + 6, + { + "REGISTERSTUDY": { + "mgnify-pipelines-toolkit": "1.4.21" + }, + "Workflow": { + "nf-core/seqsubmit": "v1.0.0dev" + } + }, + [ + "mags", + "mags/complete_metadata", + "mags/complete_metadata/upload", + "mags/complete_metadata/upload/webin_cli", + "mags/complete_metadata/upload/webin_cli/complete_metadata_accessions.tsv", + "mags/genomes_metadata.csv", + "mags/upload", + "mags/upload/manifests", + "mags/upload/manifests/results", + "mags/upload/manifests/results/MAG_upload", + "mags/upload/manifests/results/MAG_upload/ENA_backup.json", + "mags/upload/manifests/results/MAG_upload/genome_samples.xml", + "mags/upload/manifests/results/MAG_upload/manifests_test", + "mags/upload/manifests/results/MAG_upload/registered_MAGs_test.tsv", + "mags/upload/manifests/results/MAG_upload/submission.xml", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml" + ], + [ + "genomes_metadata.csv:md5,ae2b884015d1850fa63365e8a64d9fc8", + "ENA_backup.json:md5,e339bef4b1e1e11c37ef72f3d9ef2c93", + "submission.xml:md5,4b7d1d7e1b883a3eac57e1267943a8d6", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f" + ] + ], + "meta": { + 
"nf-test": "0.9.0", + "nextflow": "25.04.1" + }, + "timestamp": "2026-03-27T12:39:07.952593" + } +} \ No newline at end of file diff --git a/tests/mag_no_taxonomy.nf.test b/tests/mag_no_taxonomy.nf.test new file mode 100644 index 0000000..e820278 --- /dev/null +++ b/tests/mag_no_taxonomy.nf.test @@ -0,0 +1,39 @@ +nextflow_pipeline { + + name "Test genome submission workflow stub - no_taxonomy" + script "../main.nf" + tag "pipeline" + tag "mode_mag" + tag "test_mag_no_taxonomy" + profile "test_mag_no_taxonomy" + + test("-profile test_mag_no_taxonomy") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}', '**/manifests_test/*']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/mag_no_taxonomy.nf.test.snap b/tests/mag_no_taxonomy.nf.test.snap new file mode 100644 index 0000000..b7ad2b2 --- /dev/null +++ b/tests/mag_no_taxonomy.nf.test.snap @@ -0,0 +1,55 @@ +{ + "-profile test_mag_no_taxonomy": { + "content": [ + 9, + { + "Workflow": { + "nf-core/seqsubmit": "v1.0.0dev" + } + }, + [ + "mags", + "mags/genomes_metadata.csv", + "mags/no_taxonomy", + "mags/no_taxonomy/taxonomy", 
+ "mags/no_taxonomy/taxonomy/no_taxonomy.bin2classification.txt", + "mags/no_taxonomy/taxonomy/no_taxonomy.txt", + "mags/no_taxonomy/upload", + "mags/no_taxonomy/upload/webin_cli", + "mags/no_taxonomy/upload/webin_cli/no_taxonomy_accessions.tsv", + "mags/upload", + "mags/upload/manifests", + "mags/upload/manifests/results", + "mags/upload/manifests/results/MAG_upload", + "mags/upload/manifests/results/MAG_upload/ENA_backup.json", + "mags/upload/manifests/results/MAG_upload/genome_samples.xml", + "mags/upload/manifests/results/MAG_upload/manifests_test", + "mags/upload/manifests/results/MAG_upload/registered_MAGs_test.tsv", + "mags/upload/manifests/results/MAG_upload/submission.xml", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml" + ], + [ + "genomes_metadata.csv:md5,b1d01539d787b77e30b9bacd5b23d51f", + "no_taxonomy.bin2classification.txt:md5,e24c109efe807c7044a866999fd736f1", + "no_taxonomy.txt:md5,0f2082d3e397fd630d605dd60993b131", + "ENA_backup.json:md5,e339bef4b1e1e11c37ef72f3d9ef2c93", + "submission.xml:md5,4b7d1d7e1b883a3eac57e1267943a8d6", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "25.04.1" + }, + "timestamp": "2026-03-27T12:43:17.099315" + } +} \ No newline at end of file diff --git a/tests/mag_one_contig.nf.test b/tests/mag_one_contig.nf.test new file mode 100644 index 0000000..6ce83bb --- /dev/null +++ b/tests/mag_one_contig.nf.test @@ -0,0 +1,39 @@ +nextflow_pipeline { + + name "Test genome submission workflow stub - one_contig" + script "../main.nf" + tag "pipeline" + tag "mode_mag" + tag "test_mag_one_contig" + profile "test_mag_one_contig" + + 
test("-profile test_mag_one_contig") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}', '**/manifests_test/*']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/mag_one_contig.nf.test.snap b/tests/mag_one_contig.nf.test.snap new file mode 100644 index 0000000..f11ba98 --- /dev/null +++ b/tests/mag_one_contig.nf.test.snap @@ -0,0 +1,33 @@ +{ + "-profile test_mag_one_contig": { + "content": [ + 3, + { + "Workflow": { + "nf-core/seqsubmit": "v1.0.0dev" + } + }, + [ + "mags", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml" + ], + [ + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "25.04.1" + }, + "timestamp": "2026-03-27T12:43:36.609186" + } +} \ No newline at end of file 
diff --git a/workflows/assemblysubmit.nf b/workflows/assemblysubmit.nf index a7897ba..5743f7d 100644 --- a/workflows/assemblysubmit.nf +++ b/workflows/assemblysubmit.nf @@ -4,17 +4,18 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { COVERM_CONTIG } from '../modules/nf-core/coverm/contig/main' -include { FASTAVALIDATOR } from '../modules/nf-core/fastavalidator/main' -include { GENERATE_ASSEMBLY_MANIFEST } from '../modules/local/generate_assembly_manifest/main' -include { REGISTERSTUDY } from '../modules/local/registerstudy/main' -include { ENA_WEBIN_CLI } from '../modules/local/ena_webin_cli' - -include { MULTIQC } from '../modules/nf-core/multiqc/main' -include { paramsSummaryMap } from 'plugin/nf-schema' -include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_seqsubmit_pipeline' +include { COVERM_CONTIG } from '../modules/nf-core/coverm/contig/main' +include { FASTAVALIDATOR } from '../modules/nf-core/fastavalidator/main' +include { GENERATE_ASSEMBLY_MANIFEST } from '../modules/local/generate_assembly_manifest/main' +include { REGISTERSTUDY } from '../modules/local/registerstudy/main' +include { ENA_WEBIN_CLI_WRAPPER as SUBMIT } from '../modules/local/ena_webin_cli_wrapper' +include { ENA_WEBIN_CLI_DOWNLOAD } from '../modules/local/ena_webin_cli_download' + +include { MULTIQC } from '../modules/nf-core/multiqc/main' +include { paramsSummaryMap } from 'plugin/nf-schema' +include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' +include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' +include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_seqsubmit_pipeline' /* 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -69,16 +70,13 @@ workflow ASSEMBLYSUBMIT { // Check fasta files are properly formatted FASTAVALIDATOR ( assembly_fasta, - "true" // is_metagenome flag + "true" // enables number of contigs check - ENA requires more than 1 contig for an assembly submission ) - // TODO add some logging here to track discarded assemblies validated_fastas = assembly_fasta.join(FASTAVALIDATOR.out.success_log) .map { meta, fasta, _log -> [meta, fasta] } - // TODO add human decontamination step - // For assemblies without coverage, calculate coverage with CoverM validated_fastas.filter { meta, _fasta -> meta.coverage == null } .join(reads_fastq) @@ -94,19 +92,16 @@ workflow ASSEMBLYSUBMIT { false // interleaved ) - // Calculate average coverage using map operator + // Calculate average coverage using splitCsv operator average_coverage_ch = COVERM_CONTIG.out.coverage - .map { meta, coverage_file -> - // Read the file and calculate average - def lines = coverage_file.readLines() - if (lines.size() < 2) { - return [meta, 0.0] - } - def coverages = lines[1..-1].collect { line -> - line.split('\t')[1] as Double - } + .splitCsv(sep: '\t', skip: 1) + .map { meta, row -> + [meta, row[1] as Double] + } + .groupTuple() + .map { meta, coverages -> def average = coverages.sum() / coverages.size() - return [meta, average] + [meta, average] } // Update metadata with calculated coverage @@ -126,8 +121,6 @@ workflow ASSEMBLYSUBMIT { .filter { meta, _fasta -> meta.coverage != null } .mix( assemblies_with_added_cov_ch ) - // TODO add validation step to check number of lines in CSV matches number of assemblies - assembly_metadata_csv = assemblies_with_coverage .map { meta, fasta -> def header = 'Runs,Coverage,Assembler,Version,Filepath,Sample' @@ -141,8 +134,12 @@ workflow ASSEMBLYSUBMIT { ].join(',') def content = "${header}\n${row}" - def csv_file = 
file("${params.outdir}/${params.mode}/${meta.id}_assembly_metadata.csv") - csv_file.parent.toFile().mkdirs() + + // Create output directory if it doesn't exist + def outDir = file("${params.outdir}/${params.mode}") + outDir.mkdirs() + + def csv_file = file("${outDir}/${meta.id}_assembly_metadata.csv") csv_file.text = content [meta, csv_file] @@ -171,8 +168,13 @@ workflow ASSEMBLYSUBMIT { study_accession_ch.first() ) - ENA_WEBIN_CLI( - assemblies_with_coverage.join(GENERATE_ASSEMBLY_MANIFEST.out.manifest) + ENA_WEBIN_CLI_DOWNLOAD ( + params.webin_cli_version + ) + + SUBMIT ( + assemblies_with_coverage.join(GENERATE_ASSEMBLY_MANIFEST.out.manifest), + ENA_WEBIN_CLI_DOWNLOAD.out.webin_cli_jar ) // diff --git a/workflows/genomesubmit.nf b/workflows/genomesubmit.nf index 4b8ca08..a17299c 100644 --- a/workflows/genomesubmit.nf +++ b/workflows/genomesubmit.nf @@ -3,22 +3,24 @@ IMPORT MODULES / SUBWORKFLOWS / FUNCTIONS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { GENOME_UPLOAD } from '../modules/local/genome_upload' -include { ENA_WEBIN_CLI } from '../modules/local/ena_webin_cli' -include { REGISTERSTUDY } from '../modules/local/registerstudy/main' -include { RENAME_FASTA_FOR_CATPACK } from '../modules/local/rename_fasta_for_catpack' +include { GENOME_UPLOAD as CREATE_MANIFESTS } from '../modules/local/genome_upload' +include { ENA_WEBIN_CLI_WRAPPER as SUBMIT } from '../modules/local/ena_webin_cli_wrapper' +include { ENA_WEBIN_CLI_DOWNLOAD } from '../modules/local/ena_webin_cli_download' +include { REGISTERSTUDY } from '../modules/local/registerstudy/main' +include { RENAME_FASTA_FOR_CATPACK } from '../modules/local/rename_fasta_for_catpack' -include { COVERM_GENOME } from '../modules/nf-core/coverm/genome' -include { MULTIQC } from '../modules/nf-core/multiqc/main' -include { paramsSummaryMap } from 'plugin/nf-schema' +include { FASTAVALIDATOR } from '../modules/nf-core/fastavalidator/main' +include { COVERM_GENOME 
} from '../modules/nf-core/coverm/genome' +include { MULTIQC } from '../modules/nf-core/multiqc/main' +include { paramsSummaryMap } from 'plugin/nf-schema' -include { GENOME_EVALUATION } from '../subworkflows/local/genome_evaluation' -include { RNA_DETECTION } from '../subworkflows/local/rna_detection' -include { FASTA_CLASSIFY_CATPACK } from '../subworkflows/nf-core/fasta_classify_catpack/main' +include { GENOME_EVALUATION } from '../subworkflows/local/genome_evaluation' +include { RNA_DETECTION } from '../subworkflows/local/rna_detection' +include { FASTA_CLASSIFY_CATPACK } from '../subworkflows/nf-core/fasta_classify_catpack/main' -include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_seqsubmit_pipeline' +include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' +include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' +include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_seqsubmit_pipeline' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -74,21 +76,34 @@ workflow GENOMESUBMIT { genome_fasta = genome_fasta_and_reads.map{meta, fasta, _fq1 -> [meta, fasta]} genome_reads = genome_fasta_and_reads.map{meta, _fasta, reads -> [meta, reads]} + // --------- Check fasta files are properly formatted + FASTAVALIDATOR ( + genome_fasta, + "true" // enables number of contigs check - ENA requires more than 1 contig for a bin/MAG submission + ) + validated_fastas = genome_fasta.join(FASTAVALIDATOR.out.success_log) + .map { meta, fasta, _log -> + [meta, fasta] + } + // --------- Genome coverage calculation - genome_fasta - .branch { meta, fasta -> + validated_fastas + .branch { meta, _fasta -> genome_coverage_ref_input: meta.genome_coverage == null 
genome_coverage_present: true // Everything else goes here } .set { branched_coverage_results } - genome_reads.filter { meta, reads -> meta.genome_coverage == null } - .map { meta, reads -> [meta, reads] } - .set { genome_coverage_fq_input } + branched_coverage_results.genome_coverage_ref_input.join(genome_reads) + .multiMap { meta, fasta, fastq -> + genome: [ meta, fasta ] + raw_reads: [ meta, fastq ] + } + .set { coverm_input } COVERM_GENOME ( - genome_coverage_fq_input, - branched_coverage_results.genome_coverage_ref_input, + coverm_input.raw_reads, + coverm_input.genome, false, false, 'file' @@ -173,21 +188,21 @@ workflow GENOMESUBMIT { ) // build input structures for CAT_DB depending on what provided as input - def cat_db_input = (params.cat_db != null && params.cat_db != '') + def cat_db_input = params.cat_db ? channel.of( [['id': 'CAT_DB'], file(params.cat_db)] ) : channel.empty() - def cat_db_id_input = (params.cat_db_download_id != null && params.cat_db_download_id != '') + def cat_db_id_input = (!params.cat_db && params.cat_db_download_id) ? 
channel.of( [['id': 'CAT_DB_id'], params.cat_db_download_id] ) : channel.empty() FASTA_CLASSIFY_CATPACK ( - RENAME_FASTA_FOR_CATPACK.out.renamed_fasta, - channel.empty(), + RENAME_FASTA_FOR_CATPACK.out.renamed_fasta, // ch_bins + channel.empty(), // ch_contigs - empty because we classify bins, not contigs cat_db_input, cat_db_id_input, - false, // generate summaries - '.fasta' + false, // disable summary generation + '.fasta' // bin_suffix - the suffix of the renamed fasta files ) fasta_updated_with_taxonomy = FASTA_CLASSIFY_CATPACK.out.bat_classification @@ -205,7 +220,7 @@ workflow GENOMESUBMIT { .map { meta, fasta -> [ meta.id, - fasta, + fasta.getName(), meta.accession, meta.assembly_software, meta.binning_software, @@ -224,7 +239,8 @@ workflow GENOMESUBMIT { ].join('\t') } .collectFile( - name: "${params.outdir}/${params.mode}/genomes_metadata.csv", + name: 'genomes_metadata.csv', + storeDir: "${params.outdir}/${params.mode}", seed: [ 'genome_name', 'genome_path', @@ -247,6 +263,7 @@ workflow GENOMESUBMIT { newLine: true ) + // --------- Register study if accession not provided def study_accession_ch if (params.submission_study) { study_accession_ch = channel.of(params.submission_study) @@ -262,24 +279,43 @@ workflow GENOMESUBMIT { } } - GENOME_UPLOAD( - genome_fasta.map{meta, fasta -> fasta}.collect(), + // --------- Generate manifests + CREATE_MANIFESTS( + fasta_updated_with_stats.map{meta, fasta -> fasta}.collect(), genome_metadata_csv, - params.mode, + params.mode, // mags or bins study_accession_ch.first() ) - ch_versions = ch_versions.mix( GENOME_UPLOAD.out.versions ) - //manifests_ch = GENOME_UPLOAD.out.manifests.flatten() - // .map { manifest -> - // def prefix = manifest.name.replaceAll(/_\d+\.manifest$/, '') - // def meta = [id: prefix] - // [ meta, manifest ] - //} - //combined_ch = ch_mags.join(manifests_ch) + // All manifests were generated in one run + // Manifests should be separated into different channels using prefix as id + manifests_ch = 
CREATE_MANIFESTS.out.manifests.flatten() + .map { manifest -> + def prefix = params.test_upload ? + manifest.name.replaceAll(/_\d+\.manifest$/, '') : // Remove extension and hash suffix appended in test mode + manifest.name.replaceAll(/\.manifest$/, '') // Remove only extension in live mode + def meta = [id: prefix] + [ meta, manifest ] + } + // Combine fasta and manifests + ch_combined = fasta_updated_with_stats + .map { meta, fasta -> [meta.id, meta, fasta] } + .join( + manifests_ch.map { meta, manifest -> [meta.id, manifest] } // Has only [id: prefix] + ) + .map { id, full_meta, fasta, manifest -> + [full_meta, fasta, manifest] + } + + // --------- Upload data to ENA + ENA_WEBIN_CLI_DOWNLOAD ( + params.webin_cli_version + ) - //ENA_WEBIN_CLI( combined_ch ) - //ch_versions = ch_versions.mix( ENA_WEBIN_CLI.out.versions.first() ) + SUBMIT ( + ch_combined, + ENA_WEBIN_CLI_DOWNLOAD.out.webin_cli_jar + ) // // Collate and save software versions @@ -324,17 +360,17 @@ workflow GENOMESUBMIT { ) ) - //MULTIQC ( - // ch_multiqc_files.collect(), - // ch_multiqc_config.toList(), - // ch_multiqc_custom_config.toList(), - // ch_multiqc_logo.toList(), - // [], - // [] - //) + MULTIQC ( + ch_multiqc_files.collect(), + ch_multiqc_config.toList(), + ch_multiqc_custom_config.toList(), + ch_multiqc_logo.toList(), + [], + [] + ) emit: - multiqc_report = channel.empty() // MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html + multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html versions = ch_versions // channel: [ path(versions.yml) ] }