Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,24 @@

All notable changes to this project will be documented in this file.

## 2.0.0-beta27 - 2026-03-18

[f03804b](https://github.com/WrightonLabCSU/DRAM/commit/f03804bca43b15e55731316c00b1c34ac328c62c)...[7d9a12d](https://github.com/WrightonLabCSU/DRAM/commit/7d9a12d225c577a6b2fb0c4d7b1ba60a5588e1e8)

### Features

- Add a test version of dbcan3 to compare against dbcan2 ([efb3cc2](https://github.com/WrightonLabCSU/DRAM/commit/efb3cc23a5478f85e449099ec37285138cc5f8b7))

Adds dbcan3 and dbcan3-sub test versions; both are run when the
use_dbcan3 option is enabled.

- Switch hmmsearch to using PyHMMER search ([7d9a12d](https://github.com/WrightonLabCSU/DRAM/commit/7d9a12d225c577a6b2fb0c4d7b1ba60a5588e1e8))

PyHMMER has better parallelism support, directly calling the
lower-level C bindings for HMMER and rewriting how the work is
parallelized. This means that with the cpus=4 arg it can finish in
about 1/3 of the walltime with the exact same result.

## 2.0.0-beta26 - 2026-03-09

[605d4f5](https://github.com/WrightonLabCSU/DRAM/commit/605d4f5d619d9f373352c8f400128066edcf58ef)...[91edea7](https://github.com/WrightonLabCSU/DRAM/commit/91edea7e6974be47da036f0f8af247d3d033326a)
Expand Down
57 changes: 57 additions & 0 deletions bin/hmm_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/usr/bin/env python
import time
import pyhmmer
import click
from pathlib import Path

# Amino-acid alphabet built once at import time; shared by the sequence
# loader and the digital sequence block so both use the same Easel object.
alphabet = pyhmmer.easel.Alphabet.amino()


@click.command()
@click.option(
    "--hmm",
    type=str,
    help="Path glob to the HMM db.",
)
@click.option(
    "--input_file",
    type=click.Path(exists=True),
    help="Path to the input fasta to search against",
)
@click.option("--e_value", type=float, help="e value cutoff for filtering")
@click.option(
    "--output_file",
    type=click.Path(),
    help="Path to output file",
)
@click.option("--cpus", type=int, help="number of cpu core to run HMMER with")
def main(hmm, input_file, e_value, output_file, cpus):
    """Search a protein FASTA against one or more HMM databases with PyHMMER.

    Expands the ``--hmm`` glob, loads every profile it matches, runs
    ``pyhmmer.hmmer.hmmsearch`` over the input sequences, and writes the
    hits to ``--output_file`` in HMMER's domain-table format (the same
    layout ``hmmsearch --domtblout`` produces).

    Raises:
        click.ClickException: if the ``--hmm`` glob matches no files, so
            the pipeline fails loudly instead of emitting an empty result.
    """
    # Monotonic clock for the elapsed-time report (immune to wall-clock
    # adjustments, unlike time.time()).
    t_start = time.perf_counter()

    hmm = Path(hmm)

    # The glob pattern arrives as a plain string; resolve it here against
    # the pattern's parent directory.
    hmm_paths = list(hmm.parent.glob(hmm.name))
    if not hmm_paths:
        raise click.ClickException(f"No HMM files matched pattern: {hmm}")

    # Collect every profile from every matched HMM file.
    hmms = []
    for path in hmm_paths:
        with pyhmmer.plan7.HMMFile(path) as hmm_file:
            hmms.extend(hmm_file)

    with open(output_file, "wb") as out_fh:
        with pyhmmer.easel.SequenceFile(
            input_file, digital=True, alphabet=alphabet
        ) as sf:
            # Pre-load all sequences into one digital block so the search
            # workers can share them instead of re-reading the file.
            seqs = pyhmmer.easel.DigitalSequenceBlock(alphabet)
            seqs.extend(sf)
            first = True
            for hits in pyhmmer.hmmer.hmmsearch(hmms, seqs, cpus=cpus, E=e_value):
                # "domains" format matches --domtblout output, which the
                # downstream hmm_parser.py expects; header only once.
                hits.write(out_fh, format="domains", header=first)
                first = False
    print(f"pyhmmer search completed in {time.perf_counter() - t_start:.3} seconds")


# Entry point: Click parses command-line arguments and invokes main().
if __name__ == "__main__":
    main()
2 changes: 1 addition & 1 deletion modules/local/annotate/add_sql_descriptions.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ process ADD_SQL_DESCRIPTIONS {
errorStrategy 'finish'

conda "${moduleDir}/environment.yml"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9"

input:
tuple val(input_fasta), path(hits_file)
Expand Down
2 changes: 1 addition & 1 deletion modules/local/annotate/combine_annotations.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ process COMBINE_ANNOTATIONS {
errorStrategy 'finish'

conda "${moduleDir}/environment.yml"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9"

input:
path(fastas, stageAs: "annotations/*" )
Expand Down
1 change: 1 addition & 0 deletions modules/local/annotate/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ dependencies:
- scikit-bio=0.7.1
- scipy<2
- click<9.0
- pyhmmer
2 changes: 1 addition & 1 deletion modules/local/annotate/gene_locs.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ process GENE_LOCS {
errorStrategy 'finish'

conda "${moduleDir}/environment.yml"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9"

tag { input_fasta }

Expand Down
14 changes: 7 additions & 7 deletions modules/local/annotate/hmmsearch.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ process HMM_SEARCH {
errorStrategy 'finish'

conda "${moduleDir}/environment.yml"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9"

tag { input_fasta }

Expand All @@ -24,12 +24,12 @@ process HMM_SEARCH {
def ec_flag = ec_from_info ? "--ec_from_info" : ""

"""
hmmsearch \\
-E ${e_value} \\
--domtblout ${input_fasta}_hmmsearch.out \\
--cpu ${task.cpus} \\
${database_loc}/*.hmm \\
${fasta} > /dev/null
hmm_search.py \\
--hmm ${database_loc}/*.hmm \\
--input_file ${fasta} \\
--e_value ${e_value} \\
--output_file ${input_fasta}_hmmsearch.out \\
--cpus ${task.cpus}

hmm_parser.py \\
--hmm_domtbl ${input_fasta}_hmmsearch.out \\
Expand Down
2 changes: 1 addition & 1 deletion modules/local/annotate/merge_annotations.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ process MERGE_ANNOTATIONS {
errorStrategy 'finish'

conda "${moduleDir}/environment.yml"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9"

input:
path( ch_annotations, stageAs: "annotations/*" )
Expand Down
2 changes: 1 addition & 1 deletion modules/local/annotate/mmseqs_index.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ process MMSEQS_INDEX{
errorStrategy 'finish'

conda "${moduleDir}/environment.yml"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9"

tag { input_fasta }

Expand Down
2 changes: 1 addition & 1 deletion modules/local/annotate/mmseqs_search.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ process MMSEQS_SEARCH {
errorStrategy 'finish'

conda "${moduleDir}/environment.yml"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9"

tag { input_fasta }

Expand Down
7 changes: 5 additions & 2 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ params {
use_kegg = false
use_kofam = false
use_dbcan = false
use_dbcan3 = false
use_camper = false
use_fegenie = false
use_methyl = false
Expand Down Expand Up @@ -115,6 +116,8 @@ params {
dbcan_db = "${launchDir}/databases/dbcan/"
dbcan_fam_activities = "${launchDir}/databases/dbcan/dbcan.fam-activities.tsv"
dbcan_subfam_activities = "${launchDir}/databases/dbcan/dbcan.fam-activities.tsv"
dbcan3_db = "${launchDir}/databases/dbcan3"
dbcan3_sub_db = "${launchDir}/databases/dbcan3_sub"
// vogdb
vog_db = "${launchDir}/databases/vogdb/"
vog_list = "${launchDir}/databases/vogdb/vog_annotations_latest.tsv.gz"
Expand Down Expand Up @@ -172,7 +175,7 @@ params {
// Not the limit to the total resources available to the pipeline
// Up to queue_size processes can run in parallel, of various sizes
tiny_cpus_limit = 1
small_cpus_limit = 2
small_cpus_limit = 4
medium_cpus_limit = 6
big_cpus_limit = 12
huge_cpus_limit = 24
Expand Down Expand Up @@ -478,7 +481,7 @@ manifest {
mainScript = 'main.nf'
defaultBranch = 'master'
nextflowVersion = '!>=24'
version = '2.0.0-beta26'
version = '2.0.0-beta27'
doi = ''
}

Expand Down
14 changes: 14 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,10 @@
"type": "boolean",
"description": "Use the DBCan database for annotation."
},
"use_dbcan3": {
"type": "boolean",
"description": "Use the experimental DBCan3 databases for annotation."
},
"use_fegenie": {
"type": "boolean",
"description": "Use the FeGenie database for annotation."
Expand Down Expand Up @@ -376,6 +380,16 @@
"default": "${launchDir}/databases/dbcan/dbcan.fam-activities.tsv",
"hidden": true
},
"dbcan3_db": {
"type": "string",
"default": "${launchDir}/databases/dbcan3/",
"hidden": true
},
"dbcan3_sub_db": {
"type": "string",
"default": "${launchDir}/databases/dbcan3_sub/",
"hidden": true
},
"vog_db": {
"type": "string",
"default": "${launchDir}/databases/vog/",
Expand Down
2 changes: 2 additions & 0 deletions subworkflows/local/annotate.nf
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ workflow ANNOTATE {
use_kegg
use_kofam
use_dbcan
use_dbcan3
use_camper
use_fegenie
use_methyl
Expand Down Expand Up @@ -107,6 +108,7 @@ workflow ANNOTATE {
use_kegg,
use_kofam,
use_dbcan,
use_dbcan3,
use_camper,
use_fegenie,
use_methyl,
Expand Down
40 changes: 40 additions & 0 deletions subworkflows/local/db_search.nf
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ include { ADD_SQL_DESCRIPTIONS as SQL_DBCAN } from "../../modules/lo

include { HMM_SEARCH as HMM_SEARCH_KOFAM } from "../../modules/local/annotate/hmmsearch.nf"
include { HMM_SEARCH as HMM_SEARCH_DBCAN } from "../../modules/local/annotate/hmmsearch.nf"
include { HMM_SEARCH as HMM_SEARCH_DBCAN3 } from "../../modules/local/annotate/hmmsearch.nf"
include { HMM_SEARCH as HMM_SEARCH_DBCAN3_SUB } from "../../modules/local/annotate/hmmsearch.nf"
include { HMM_SEARCH as HMM_SEARCH_VOG } from "../../modules/local/annotate/hmmsearch.nf"
include { HMM_SEARCH as HMM_SEARCH_CAMPER } from "../../modules/local/annotate/hmmsearch.nf"
include { HMM_SEARCH as HMM_SEARCH_CANTHYD } from "../../modules/local/annotate/hmmsearch.nf"
Expand All @@ -53,6 +55,7 @@ workflow DB_SEARCH {
use_kegg
use_kofam
use_dbcan
use_dbcan3
use_camper
use_fegenie
use_methyl
Expand All @@ -70,6 +73,7 @@ workflow DB_SEARCH {
use_kegg,
use_kofam,
use_dbcan,
use_dbcan3,
use_camper,
use_fegenie,
use_methyl,
Expand All @@ -94,6 +98,8 @@ workflow DB_SEARCH {

kegg_name = "kegg"
dbcan_name = "dbcan"
dbcan3_name = "dbcan3"
dbcan3_sub_name = "dbcan3_sub"
kofam_name = "kofam"
merops_name = "merops"
viral_name = "viral"
Expand Down Expand Up @@ -170,6 +176,32 @@ workflow DB_SEARCH {
ch_dbcan_formatted = SQL_DBCAN.out.sql_formatted_hits
formattedOutputChannels = formattedOutputChannels.mix(ch_dbcan_formatted)
}
// dbCAN3 annotation
if (use_dbcan3) {
ch_combined_proteins_locs = ch_called_proteins.join(ch_gene_locs)
HMM_SEARCH_DBCAN3 (
ch_combined_proteins_locs,
params.dbcan_e_value,
DB_CHANNEL_SETUP.out.ch_dbcan3_db,
default_sheet,
false,
dbcan3_name
)
ch_dbcan3_formatted = HMM_SEARCH_DBCAN3.out.formatted_hits
formattedOutputChannels = formattedOutputChannels.mix(ch_dbcan3_formatted)


HMM_SEARCH_DBCAN3_SUB (
ch_combined_proteins_locs,
params.dbcan_e_value,
DB_CHANNEL_SETUP.out.ch_dbcan3_sub_db,
default_sheet,
false,
dbcan3_sub_name
)
ch_dbcan3_sub_formatted = HMM_SEARCH_DBCAN3_SUB.out.formatted_hits
formattedOutputChannels = formattedOutputChannels.mix(ch_dbcan3_sub_formatted)
}
// CAMPER annotation
if (use_camper) {
// HMM
Expand Down Expand Up @@ -329,6 +361,7 @@ workflow DB_CHANNEL_SETUP {
use_kegg
use_kofam
use_dbcan
use_dbcan3
use_camper
use_fegenie
use_methyl
Expand Down Expand Up @@ -377,6 +410,11 @@ workflow DB_CHANNEL_SETUP {
ch_dbcan_db = file(params.dbcan_db).exists() ? file(params.dbcan_db) : error("Error: If using --annotate, you must supply prebuilt databases. DBCAN database file not found at ${params.dbcan_db}")
}

if (use_dbcan3) {
ch_dbcan3_db = file(params.dbcan3_db).exists() ? file(params.dbcan3_db) : error("Error: If using --annotate, you must supply prebuilt databases. DBCAN3 database file not found at ${params.dbcan3_db}")
ch_dbcan3_sub_db = file(params.dbcan3_sub_db).exists() ? file(params.dbcan3_sub_db) : error("Error: If using --annotate, you must supply prebuilt databases. DBCAN3 sub database file not found at ${params.dbcan3_sub_db}")
}

if (use_camper) {
ch_camper_hmm_db = file(params.camper_hmm_db).exists() ? file(params.camper_hmm_db) : error("Error: If using --annotate, you must supply prebuilt databases. CAMPER HMM database file not found at ${params.camper_hmm_db}")
ch_camper_mmseqs_db = file(params.camper_mmseqs_db).exists() ? file(params.camper_mmseqs_db) : error("Error: If using --annotate, you must supply prebuilt databases. CAMPER MMseqs2 database file not found at ${params.camper_mmseqs_db}")
Expand Down Expand Up @@ -440,6 +478,8 @@ workflow DB_CHANNEL_SETUP {
ch_kegg_db
ch_kofam_db
ch_dbcan_db
ch_dbcan3_db
ch_dbcan3_sub_db
ch_camper_hmm_db
ch_camper_mmseqs_db
ch_camper_mmseqs_list
Expand Down
2 changes: 2 additions & 0 deletions workflows/dram.nf
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ workflow DRAM {
use_kegg = getDBFlag(anno_dbs, 'kegg', value_for_all)
use_kofam = getDBFlag(anno_dbs, 'kofam', value_for_all)
use_dbcan = getDBFlag(anno_dbs, 'dbcan', value_for_all)
use_dbcan3 = getDBFlag(anno_dbs, 'dbcan3', value_for_all)
use_camper = getDBFlag(anno_dbs, 'camper', value_for_all)
use_fegenie = getDBFlag(anno_dbs, 'fegenie', value_for_all)
use_methyl = getDBFlag(anno_dbs, 'methyl', value_for_all)
Expand Down Expand Up @@ -230,6 +231,7 @@ workflow DRAM {
use_kegg,
use_kofam,
use_dbcan,
use_dbcan3,
use_camper,
use_fegenie,
use_methyl,
Expand Down
Loading