diff --git a/CHANGELOG.md b/CHANGELOG.md index f96c2c37..32276d2c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,24 @@ All notable changes to this project will be documented in this file. +## 2.0.0-beta27 - 2026-03-18 + +[f03804b](https://github.com/WrightonLabCSU/DRAM/commit/f03804bca43b15e55731316c00b1c34ac328c62c)...[7d9a12d](https://github.com/WrightonLabCSU/DRAM/commit/7d9a12d225c577a6b2fb0c4d7b1ba60a5588e1e8) + +### Features + +- Add a test version of dbcan3 to compare against dbcan2 ([efb3cc2](https://github.com/WrightonLabCSU/DRAM/commit/efb3cc23a5478f85e449099ec37285138cc5f8b7)) + + dbcan3 and dbcan3-sub test versions, will run both if the use_dbcan3 + option is present. + +- Switch hmmsearch to using PyHMMER search ([7d9a12d](https://github.com/WrightonLabCSU/DRAM/commit/7d9a12d225c577a6b2fb0c4d7b1ba60a5588e1e8)) + + PyHMMER has better parallelism support, directly calling + the lower level C bindings for HMMER and rewriting how + it parallelizes. This means that when you use the cpus=4 arg, it can take + 1/3 of the walltime with the exact same result. 
+ ## 2.0.0-beta26 - 2026-03-09 [605d4f5](https://github.com/WrightonLabCSU/DRAM/commit/605d4f5d619d9f373352c8f400128066edcf58ef)...[91edea7](https://github.com/WrightonLabCSU/DRAM/commit/91edea7e6974be47da036f0f8af247d3d033326a) diff --git a/bin/hmm_search.py b/bin/hmm_search.py new file mode 100755 index 00000000..ed096a37 --- /dev/null +++ b/bin/hmm_search.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python +import time +import pyhmmer +import click +from pathlib import Path + +alphabet = pyhmmer.easel.Alphabet.amino() + + +@click.command() +@click.option( + "--hmm", + type=str, + help="Path glob to the HMM db.", +) +@click.option( + "--input_file", + type=click.Path(exists=True), + help="Path to the input fasta to search against", +) +@click.option("--e_value", type=float, help="e value cutoff for filtering") +@click.option( + "--output_file", + type=click.Path(), + help="Path to output file", +) +@click.option("--cpus", type=int, help="number of cpu core to run HMMER with") +def main(hmm, input_file, e_value, output_file, cpus): + t1 = time.time() + + hmm = Path(hmm) + + hmm_paths = hmm.parent.glob(hmm.name) + + hmms = [] + for path in hmm_paths: + with pyhmmer.plan7.HMMFile(path) as hmm_file: + hmms.extend(hmm_file) + + print(hmms) + + with open(output_file, "wb") as out_fh: + with pyhmmer.easel.SequenceFile( + input_file, digital=True, alphabet=alphabet + ) as sf: + seqs = pyhmmer.easel.DigitalSequenceBlock(alphabet) + seqs.extend(sf) + first = True + for hits in pyhmmer.hmmer.hmmsearch(hmms, seqs, cpus=cpus, E=e_value): + hits.write(out_fh, format="domains", header=first) + first = False + # total = sum(len(hits) for hits in pyhmmer.hmmer.hmmsearch(hmms, seqs, cpus=8, E=1e-15)) + print(f"pyhmmer search completed in {time.time() - t1:.3} seconds") + + +if __name__ == "__main__": + main() diff --git a/modules/local/annotate/add_sql_descriptions.nf b/modules/local/annotate/add_sql_descriptions.nf index 6e00379f..88ad590e 100644 --- 
a/modules/local/annotate/add_sql_descriptions.nf +++ b/modules/local/annotate/add_sql_descriptions.nf @@ -4,7 +4,7 @@ process ADD_SQL_DESCRIPTIONS { errorStrategy 'finish' conda "${moduleDir}/environment.yml" - container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c" + container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9" input: tuple val(input_fasta), path(hits_file) diff --git a/modules/local/annotate/combine_annotations.nf b/modules/local/annotate/combine_annotations.nf index a0c35e90..dcc039b9 100644 --- a/modules/local/annotate/combine_annotations.nf +++ b/modules/local/annotate/combine_annotations.nf @@ -4,7 +4,7 @@ process COMBINE_ANNOTATIONS { errorStrategy 'finish' conda "${moduleDir}/environment.yml" - container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c" + container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9" input: path(fastas, stageAs: "annotations/*" ) diff --git a/modules/local/annotate/environment.yml b/modules/local/annotate/environment.yml index e81f7bf4..2f09b18e 100644 --- a/modules/local/annotate/environment.yml +++ b/modules/local/annotate/environment.yml @@ -10,3 +10,4 @@ dependencies: - scikit-bio=0.7.1 - scipy<2 - click<9.0 + - pyhmmer diff --git a/modules/local/annotate/gene_locs.nf b/modules/local/annotate/gene_locs.nf index 97c7e619..2aeb7f08 100644 --- a/modules/local/annotate/gene_locs.nf +++ b/modules/local/annotate/gene_locs.nf @@ -4,7 +4,7 @@ process GENE_LOCS { errorStrategy 'finish' conda "${moduleDir}/environment.yml" - container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c" + container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9" tag { input_fasta } diff --git a/modules/local/annotate/hmmsearch.nf b/modules/local/annotate/hmmsearch.nf index ce3cb3dc..52c9dbd9 100644 --- 
a/modules/local/annotate/hmmsearch.nf +++ b/modules/local/annotate/hmmsearch.nf @@ -4,7 +4,7 @@ process HMM_SEARCH { errorStrategy 'finish' conda "${moduleDir}/environment.yml" - container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c" + container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9" tag { input_fasta } @@ -24,12 +24,12 @@ process HMM_SEARCH { def ec_flag = ec_from_info ? "--ec_from_info" : "" """ - hmmsearch \\ - -E ${e_value} \\ - --domtblout ${input_fasta}_hmmsearch.out \\ - --cpu ${task.cpus} \\ - ${database_loc}/*.hmm \\ - ${fasta} > /dev/null + hmm_search.py \\ + --hmm ${database_loc}/*.hmm \\ + --input_file ${fasta} \\ + --e_value ${e_value} \\ + --output_file ${input_fasta}_hmmsearch.out \\ + --cpus ${task.cpus} hmm_parser.py \\ --hmm_domtbl ${input_fasta}_hmmsearch.out \\ diff --git a/modules/local/annotate/merge_annotations.nf b/modules/local/annotate/merge_annotations.nf index cce32b23..e55272cb 100644 --- a/modules/local/annotate/merge_annotations.nf +++ b/modules/local/annotate/merge_annotations.nf @@ -4,7 +4,7 @@ process MERGE_ANNOTATIONS { errorStrategy 'finish' conda "${moduleDir}/environment.yml" - container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c" + container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9" input: path( ch_annotations, stageAs: "annotations/*" ) diff --git a/modules/local/annotate/mmseqs_index.nf b/modules/local/annotate/mmseqs_index.nf index c33d12b1..73c0e04e 100644 --- a/modules/local/annotate/mmseqs_index.nf +++ b/modules/local/annotate/mmseqs_index.nf @@ -4,7 +4,7 @@ process MMSEQS_INDEX{ errorStrategy 'finish' conda "${moduleDir}/environment.yml" - container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c" + container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9" tag { 
input_fasta } diff --git a/modules/local/annotate/mmseqs_search.nf b/modules/local/annotate/mmseqs_search.nf index 279de72b..5407e3fe 100644 --- a/modules/local/annotate/mmseqs_search.nf +++ b/modules/local/annotate/mmseqs_search.nf @@ -4,7 +4,7 @@ process MMSEQS_SEARCH { errorStrategy 'finish' conda "${moduleDir}/environment.yml" - container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c" + container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9" tag { input_fasta } diff --git a/nextflow.config b/nextflow.config index 7f7821ff..3dab9ef2 100644 --- a/nextflow.config +++ b/nextflow.config @@ -48,6 +48,7 @@ params { use_kegg = false use_kofam = false use_dbcan = false + use_dbcan3 = false use_camper = false use_fegenie = false use_methyl = false @@ -115,6 +116,8 @@ params { dbcan_db = "${launchDir}/databases/dbcan/" dbcan_fam_activities = "${launchDir}/databases/dbcan/dbcan.fam-activities.tsv" dbcan_subfam_activities = "${launchDir}/databases/dbcan/dbcan.fam-activities.tsv" + dbcan3_db = "${launchDir}/databases/dbcan3" + dbcan3_sub_db = "${launchDir}/databases/dbcan3_sub" // vogdb vog_db = "${launchDir}/databases/vogdb/" vog_list = "${launchDir}/databases/vogdb/vog_annotations_latest.tsv.gz" @@ -172,7 +175,7 @@ params { // Not the limit to the total resources available to the pipeline // Up to queue_size processes can run in parallel, of various sizes tiny_cpus_limit = 1 - small_cpus_limit = 2 + small_cpus_limit = 4 medium_cpus_limit = 6 big_cpus_limit = 12 huge_cpus_limit = 24 @@ -478,7 +481,7 @@ manifest { mainScript = 'main.nf' defaultBranch = 'master' nextflowVersion = '!>=24' - version = '2.0.0-beta26' + version = '2.0.0-beta27' doi = '' } diff --git a/nextflow_schema.json b/nextflow_schema.json index ce03e4ee..44854bf1 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -146,6 +146,10 @@ "type": "boolean", "description": "Use the DBCan database for annotation." 
}, + "use_dbcan3": { + "type": "boolean", + "description": "Use the experimental DBCan3 databases for annotation." + }, "use_fegenie": { "type": "boolean", "description": "Use the FeGenie database for annotation." @@ -376,6 +380,16 @@ "default": "${launchDir}/databases/dbcan/dbcan.fam-activities.tsv", "hidden": true }, + "dbcan3_db": { + "type": "string", + "default": "${launchDir}/databases/dbcan3/", + "hidden": true + }, + "dbcan3_sub_db": { + "type": "string", + "default": "${launchDir}/databases/dbcan3_sub/", + "hidden": true + }, "vog_db": { "type": "string", "default": "${launchDir}/databases/vog/", diff --git a/subworkflows/local/annotate.nf b/subworkflows/local/annotate.nf index 7efc4440..3316fb82 100644 --- a/subworkflows/local/annotate.nf +++ b/subworkflows/local/annotate.nf @@ -18,6 +18,7 @@ workflow ANNOTATE { use_kegg use_kofam use_dbcan + use_dbcan3 use_camper use_fegenie use_methyl @@ -107,6 +108,7 @@ workflow ANNOTATE { use_kegg, use_kofam, use_dbcan, + use_dbcan3, use_camper, use_fegenie, use_methyl, diff --git a/subworkflows/local/db_search.nf b/subworkflows/local/db_search.nf index a08673c4..a4edfec6 100644 --- a/subworkflows/local/db_search.nf +++ b/subworkflows/local/db_search.nf @@ -32,6 +32,8 @@ include { ADD_SQL_DESCRIPTIONS as SQL_DBCAN } from "../../modules/lo include { HMM_SEARCH as HMM_SEARCH_KOFAM } from "../../modules/local/annotate/hmmsearch.nf" include { HMM_SEARCH as HMM_SEARCH_DBCAN } from "../../modules/local/annotate/hmmsearch.nf" +include { HMM_SEARCH as HMM_SEARCH_DBCAN3 } from "../../modules/local/annotate/hmmsearch.nf" +include { HMM_SEARCH as HMM_SEARCH_DBCAN3_SUB } from "../../modules/local/annotate/hmmsearch.nf" include { HMM_SEARCH as HMM_SEARCH_VOG } from "../../modules/local/annotate/hmmsearch.nf" include { HMM_SEARCH as HMM_SEARCH_CAMPER } from "../../modules/local/annotate/hmmsearch.nf" include { HMM_SEARCH as HMM_SEARCH_CANTHYD } from "../../modules/local/annotate/hmmsearch.nf" @@ -53,6 +55,7 @@ workflow DB_SEARCH { 
use_kegg use_kofam use_dbcan + use_dbcan3 use_camper use_fegenie use_methyl @@ -70,6 +73,7 @@ workflow DB_SEARCH { use_kegg, use_kofam, use_dbcan, + use_dbcan3, use_camper, use_fegenie, use_methyl, @@ -94,6 +98,8 @@ workflow DB_SEARCH { kegg_name = "kegg" dbcan_name = "dbcan" + dbcan3_name = "dbcan3" + dbcan3_sub_name = "dbcan3_sub" kofam_name = "kofam" merops_name = "merops" viral_name = "viral" @@ -170,6 +176,32 @@ workflow DB_SEARCH { ch_dbcan_formatted = SQL_DBCAN.out.sql_formatted_hits formattedOutputChannels = formattedOutputChannels.mix(ch_dbcan_formatted) } + // dbCAN3 annotation + if (use_dbcan3) { + ch_combined_proteins_locs = ch_called_proteins.join(ch_gene_locs) + HMM_SEARCH_DBCAN3 ( + ch_combined_proteins_locs, + params.dbcan_e_value, + DB_CHANNEL_SETUP.out.ch_dbcan3_db, + default_sheet, + false, + dbcan3_name + ) + ch_dbcan3_formatted = HMM_SEARCH_DBCAN3.out.formatted_hits + formattedOutputChannels = formattedOutputChannels.mix(ch_dbcan3_formatted) + + + HMM_SEARCH_DBCAN3_SUB ( + ch_combined_proteins_locs, + params.dbcan_e_value, + DB_CHANNEL_SETUP.out.ch_dbcan3_sub_db, + default_sheet, + false, + dbcan3_sub_name + ) + ch_dbcan3_sub_formatted = HMM_SEARCH_DBCAN3_SUB.out.formatted_hits + formattedOutputChannels = formattedOutputChannels.mix(ch_dbcan3_sub_formatted) + } // CAMPER annotation if (use_camper) { // HMM @@ -329,6 +361,7 @@ workflow DB_CHANNEL_SETUP { use_kegg use_kofam use_dbcan + use_dbcan3 use_camper use_fegenie use_methyl @@ -377,6 +410,11 @@ workflow DB_CHANNEL_SETUP { ch_dbcan_db = file(params.dbcan_db).exists() ? file(params.dbcan_db) : error("Error: If using --annotate, you must supply prebuilt databases. DBCAN database file not found at ${params.dbcan_db}") } + if (use_dbcan3) { + ch_dbcan3_db = file(params.dbcan3_db).exists() ? file(params.dbcan3_db) : error("Error: If using --annotate, you must supply prebuilt databases. 
DBCAN3 database file not found at ${params.dbcan3_db}") + ch_dbcan3_sub_db = file(params.dbcan3_sub_db).exists() ? file(params.dbcan3_sub_db) : error("Error: If using --annotate, you must supply prebuilt databases. DBCAN3 sub database file not found at ${params.dbcan3_sub_db}") + } + if (use_camper) { ch_camper_hmm_db = file(params.camper_hmm_db).exists() ? file(params.camper_hmm_db) : error("Error: If using --annotate, you must supply prebuilt databases. CAMPER HMM database file not found at ${params.camper_hmm_db}") ch_camper_mmseqs_db = file(params.camper_mmseqs_db).exists() ? file(params.camper_mmseqs_db) : error("Error: If using --annotate, you must supply prebuilt databases. CAMPER MMseqs2 database file not found at ${params.camper_mmseqs_db}") @@ -440,6 +478,8 @@ workflow DB_CHANNEL_SETUP { ch_kegg_db ch_kofam_db ch_dbcan_db + ch_dbcan3_db + ch_dbcan3_sub_db ch_camper_hmm_db ch_camper_mmseqs_db ch_camper_mmseqs_list diff --git a/workflows/dram.nf b/workflows/dram.nf index 5154fadb..53481ba2 100644 --- a/workflows/dram.nf +++ b/workflows/dram.nf @@ -90,6 +90,7 @@ workflow DRAM { use_kegg = getDBFlag(anno_dbs, 'kegg', value_for_all) use_kofam = getDBFlag(anno_dbs, 'kofam', value_for_all) use_dbcan = getDBFlag(anno_dbs, 'dbcan', value_for_all) + use_dbcan3 = getDBFlag(anno_dbs, 'dbcan3', value_for_all) use_camper = getDBFlag(anno_dbs, 'camper', value_for_all) use_fegenie = getDBFlag(anno_dbs, 'fegenie', value_for_all) use_methyl = getDBFlag(anno_dbs, 'methyl', value_for_all) @@ -230,6 +231,7 @@ workflow DRAM { use_kegg, use_kofam, use_dbcan, + use_dbcan3, use_camper, use_fegenie, use_methyl,