Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,24 @@

All notable changes to this project will be documented in this file.

## 2.0.0-beta27 - 2026-03-18

[f03804b](https://github.com/WrightonLabCSU/DRAM/commit/f03804bca43b15e55731316c00b1c34ac328c62c)...[7d9a12d](https://github.com/WrightonLabCSU/DRAM/commit/7d9a12d225c577a6b2fb0c4d7b1ba60a5588e1e8)

### Features

- Add a test version of dbcan3 to compare against dbcan2 ([efb3cc2](https://github.com/WrightonLabCSU/DRAM/commit/efb3cc23a5478f85e449099ec37285138cc5f8b7))

Adds dbcan3 and dbcan3-sub test versions; both are run when the
use_dbcan3 option is enabled.

- Switch hmmsearch to using PyHMMER search ([7d9a12d](https://github.com/WrightonLabCSU/DRAM/commit/7d9a12d225c577a6b2fb0c4d7b1ba60a5588e1e8))

PyHMMER has better parallelism support, directly calling the
lower-level C bindings for HMMER and rewriting how the work is
parallelized. This means that with the cpus=4 arg it can finish in
about 1/3 of the walltime with the exact same result.

## 2.0.0-beta26 - 2026-03-09

[605d4f5](https://github.com/WrightonLabCSU/DRAM/commit/605d4f5d619d9f373352c8f400128066edcf58ef)...[91edea7](https://github.com/WrightonLabCSU/DRAM/commit/91edea7e6974be47da036f0f8af247d3d033326a)
Expand Down
57 changes: 57 additions & 0 deletions bin/hmm_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/usr/bin/env python
import time
import pyhmmer
import click
from pathlib import Path

# Amino-acid alphabet built once at import time; shared by the sequence
# loader and the digital sequence block so both use the same Easel object.
alphabet = pyhmmer.easel.Alphabet.amino()


@click.command()
@click.option(
    "--hmm",
    type=str,
    help="Path glob to the HMM db.",
)
@click.option(
    "--input_file",
    type=click.Path(exists=True),
    help="Path to the input fasta to search against",
)
@click.option("--e_value", type=float, help="e value cutoff for filtering")
@click.option(
    "--output_file",
    type=click.Path(),
    help="Path to output file",
)
@click.option("--cpus", type=int, help="number of cpu core to run HMMER with")
def main(hmm, input_file, e_value, output_file, cpus):
    """Search a protein FASTA against one or more HMM databases with PyHMMER.

    Expands the ``--hmm`` glob, loads every profile it matches, runs
    ``pyhmmer.hmmer.hmmsearch`` over the input sequences, and writes the
    hits to ``--output_file`` in HMMER's domain-table format (the same
    layout ``hmmsearch --domtblout`` produces).

    Raises:
        click.ClickException: if the ``--hmm`` glob matches no files, so
            the pipeline fails loudly instead of emitting an empty result.
    """
    # Monotonic clock for the elapsed-time report (immune to wall-clock
    # adjustments, unlike time.time()).
    t_start = time.perf_counter()

    hmm = Path(hmm)

    # The glob pattern arrives as a plain string; resolve it here against
    # the pattern's parent directory.
    hmm_paths = list(hmm.parent.glob(hmm.name))
    if not hmm_paths:
        raise click.ClickException(f"No HMM files matched pattern: {hmm}")

    # Collect every profile from every matched HMM file.
    hmms = []
    for path in hmm_paths:
        with pyhmmer.plan7.HMMFile(path) as hmm_file:
            hmms.extend(hmm_file)

    with open(output_file, "wb") as out_fh:
        with pyhmmer.easel.SequenceFile(
            input_file, digital=True, alphabet=alphabet
        ) as sf:
            # Pre-load all sequences into one digital block so the search
            # workers can share them instead of re-reading the file.
            seqs = pyhmmer.easel.DigitalSequenceBlock(alphabet)
            seqs.extend(sf)
            first = True
            for hits in pyhmmer.hmmer.hmmsearch(hmms, seqs, cpus=cpus, E=e_value):
                # "domains" format matches --domtblout output, which the
                # downstream hmm_parser.py expects; header only once.
                hits.write(out_fh, format="domains", header=first)
                first = False
    print(f"pyhmmer search completed in {time.perf_counter() - t_start:.3} seconds")


# Entry point: Click parses command-line arguments and invokes main().
if __name__ == "__main__":
    main()
2 changes: 1 addition & 1 deletion modules/local/annotate/add_sql_descriptions.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ process ADD_SQL_DESCRIPTIONS {
errorStrategy 'finish'

conda "${moduleDir}/environment.yml"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9"

input:
tuple val(input_fasta), path(hits_file)
Expand Down
2 changes: 1 addition & 1 deletion modules/local/annotate/combine_annotations.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ process COMBINE_ANNOTATIONS {
errorStrategy 'finish'

conda "${moduleDir}/environment.yml"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9"

input:
path(fastas, stageAs: "annotations/*" )
Expand Down
1 change: 1 addition & 0 deletions modules/local/annotate/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ dependencies:
- scikit-bio=0.7.1
- scipy<2
- click<9.0
- pyhmmer
2 changes: 1 addition & 1 deletion modules/local/annotate/gene_locs.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ process GENE_LOCS {
errorStrategy 'finish'

conda "${moduleDir}/environment.yml"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9"

tag { input_fasta }

Expand Down
14 changes: 7 additions & 7 deletions modules/local/annotate/hmmsearch.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ process HMM_SEARCH {
errorStrategy 'finish'

conda "${moduleDir}/environment.yml"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9"

tag { input_fasta }

Expand All @@ -24,12 +24,12 @@ process HMM_SEARCH {
def ec_flag = ec_from_info ? "--ec_from_info" : ""

"""
hmmsearch \\
-E ${e_value} \\
--domtblout ${input_fasta}_hmmsearch.out \\
--cpu ${task.cpus} \\
${database_loc}/*.hmm \\
${fasta} > /dev/null
hmm_search.py \\
--hmm ${database_loc}/*.hmm \\
--input_file ${fasta} \\
--e_value ${e_value} \\
--output_file ${input_fasta}_hmmsearch.out \\
--cpus ${task.cpus}

hmm_parser.py \\
--hmm_domtbl ${input_fasta}_hmmsearch.out \\
Expand Down
2 changes: 1 addition & 1 deletion modules/local/annotate/merge_annotations.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ process MERGE_ANNOTATIONS {
errorStrategy 'finish'

conda "${moduleDir}/environment.yml"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9"

input:
path( ch_annotations, stageAs: "annotations/*" )
Expand Down
2 changes: 1 addition & 1 deletion modules/local/annotate/mmseqs_index.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ process MMSEQS_INDEX{
errorStrategy 'finish'

conda "${moduleDir}/environment.yml"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9"

tag { input_fasta }

Expand Down
2 changes: 1 addition & 1 deletion modules/local/annotate/mmseqs_search.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ process MMSEQS_SEARCH {
errorStrategy 'finish'

conda "${moduleDir}/environment.yml"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c"
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9"

tag { input_fasta }

Expand Down
7 changes: 5 additions & 2 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ params {
use_kegg = false
use_kofam = false
use_dbcan = false
use_dbcan3 = false
use_camper = false
use_fegenie = false
use_methyl = false
Expand Down Expand Up @@ -115,6 +116,8 @@ params {
dbcan_db = "${launchDir}/databases/dbcan/"
dbcan_fam_activities = "${launchDir}/databases/dbcan/dbcan.fam-activities.tsv"
dbcan_subfam_activities = "${launchDir}/databases/dbcan/dbcan.fam-activities.tsv"
dbcan3_db = "${launchDir}/databases/dbcan3"
dbcan3_sub_db = "${launchDir}/databases/dbcan3_sub"
// vogdb
vog_db = "${launchDir}/databases/vogdb/"
vog_list = "${launchDir}/databases/vogdb/vog_annotations_latest.tsv.gz"
Expand Down Expand Up @@ -172,7 +175,7 @@ params {
// Not the limit to the total resources available to the pipeline
// Up to queue_size processes can run in parallel, of various sizes
tiny_cpus_limit = 1
small_cpus_limit = 2
small_cpus_limit = 4
medium_cpus_limit = 6
big_cpus_limit = 12
huge_cpus_limit = 24
Expand Down Expand Up @@ -478,7 +481,7 @@ manifest {
mainScript = 'main.nf'
defaultBranch = 'master'
nextflowVersion = '!>=24'
version = '2.0.0-beta26'
version = '2.0.0-beta27'
doi = ''
}

Expand Down
14 changes: 14 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,10 @@
"type": "boolean",
"description": "Use the DBCan database for annotation."
},
"use_dbcan3": {
"type": "boolean",
"description": "Use the experimental DBCan3 databases for annotation."
},
"use_fegenie": {
"type": "boolean",
"description": "Use the FeGenie database for annotation."
Expand Down Expand Up @@ -376,6 +380,16 @@
"default": "${launchDir}/databases/dbcan/dbcan.fam-activities.tsv",
"hidden": true
},
"dbcan3_db": {
"type": "string",
"default": "${launchDir}/databases/dbcan3/",
"hidden": true
},
"dbcan3_sub_db": {
"type": "string",
"default": "${launchDir}/databases/dbcan3_sub/",
"hidden": true
},
"vog_db": {
"type": "string",
"default": "${launchDir}/databases/vog/",
Expand Down
2 changes: 2 additions & 0 deletions subworkflows/local/annotate.nf
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ workflow ANNOTATE {
use_kegg
use_kofam
use_dbcan
use_dbcan3
use_camper
use_fegenie
use_methyl
Expand Down Expand Up @@ -107,6 +108,7 @@ workflow ANNOTATE {
use_kegg,
use_kofam,
use_dbcan,
use_dbcan3,
use_camper,
use_fegenie,
use_methyl,
Expand Down
40 changes: 40 additions & 0 deletions subworkflows/local/db_search.nf
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ include { ADD_SQL_DESCRIPTIONS as SQL_DBCAN } from "../../modules/lo

include { HMM_SEARCH as HMM_SEARCH_KOFAM } from "../../modules/local/annotate/hmmsearch.nf"
include { HMM_SEARCH as HMM_SEARCH_DBCAN } from "../../modules/local/annotate/hmmsearch.nf"
include { HMM_SEARCH as HMM_SEARCH_DBCAN3 } from "../../modules/local/annotate/hmmsearch.nf"
include { HMM_SEARCH as HMM_SEARCH_DBCAN3_SUB } from "../../modules/local/annotate/hmmsearch.nf"
include { HMM_SEARCH as HMM_SEARCH_VOG } from "../../modules/local/annotate/hmmsearch.nf"
include { HMM_SEARCH as HMM_SEARCH_CAMPER } from "../../modules/local/annotate/hmmsearch.nf"
include { HMM_SEARCH as HMM_SEARCH_CANTHYD } from "../../modules/local/annotate/hmmsearch.nf"
Expand All @@ -53,6 +55,7 @@ workflow DB_SEARCH {
use_kegg
use_kofam
use_dbcan
use_dbcan3
use_camper
use_fegenie
use_methyl
Expand All @@ -70,6 +73,7 @@ workflow DB_SEARCH {
use_kegg,
use_kofam,
use_dbcan,
use_dbcan3,
use_camper,
use_fegenie,
use_methyl,
Expand All @@ -94,6 +98,8 @@ workflow DB_SEARCH {

kegg_name = "kegg"
dbcan_name = "dbcan"
dbcan3_name = "dbcan3"
dbcan3_sub_name = "dbcan3_sub"
kofam_name = "kofam"
merops_name = "merops"
viral_name = "viral"
Expand Down Expand Up @@ -170,6 +176,32 @@ workflow DB_SEARCH {
ch_dbcan_formatted = SQL_DBCAN.out.sql_formatted_hits
formattedOutputChannels = formattedOutputChannels.mix(ch_dbcan_formatted)
}
// dbCAN3 annotation
if (use_dbcan3) {
ch_combined_proteins_locs = ch_called_proteins.join(ch_gene_locs)
HMM_SEARCH_DBCAN3 (
ch_combined_proteins_locs,
params.dbcan_e_value,
DB_CHANNEL_SETUP.out.ch_dbcan3_db,
default_sheet,
false,
dbcan3_name
)
ch_dbcan3_formatted = HMM_SEARCH_DBCAN3.out.formatted_hits
formattedOutputChannels = formattedOutputChannels.mix(ch_dbcan3_formatted)


HMM_SEARCH_DBCAN3_SUB (
ch_combined_proteins_locs,
params.dbcan_e_value,
DB_CHANNEL_SETUP.out.ch_dbcan3_sub_db,
default_sheet,
false,
dbcan3_sub_name
)
ch_dbcan3_sub_formatted = HMM_SEARCH_DBCAN3_SUB.out.formatted_hits
formattedOutputChannels = formattedOutputChannels.mix(ch_dbcan3_sub_formatted)
}
// CAMPER annotation
if (use_camper) {
// HMM
Expand Down Expand Up @@ -329,6 +361,7 @@ workflow DB_CHANNEL_SETUP {
use_kegg
use_kofam
use_dbcan
use_dbcan3
use_camper
use_fegenie
use_methyl
Expand Down Expand Up @@ -377,6 +410,11 @@ workflow DB_CHANNEL_SETUP {
ch_dbcan_db = file(params.dbcan_db).exists() ? file(params.dbcan_db) : error("Error: If using --annotate, you must supply prebuilt databases. DBCAN database file not found at ${params.dbcan_db}")
}

if (use_dbcan3) {
ch_dbcan3_db = file(params.dbcan3_db).exists() ? file(params.dbcan3_db) : error("Error: If using --annotate, you must supply prebuilt databases. DBCAN3 database file not found at ${params.dbcan3_db}")
ch_dbcan3_sub_db = file(params.dbcan3_sub_db).exists() ? file(params.dbcan3_sub_db) : error("Error: If using --annotate, you must supply prebuilt databases. DBCAN3 sub database file not found at ${params.dbcan3_sub_db}")
}

if (use_camper) {
ch_camper_hmm_db = file(params.camper_hmm_db).exists() ? file(params.camper_hmm_db) : error("Error: If using --annotate, you must supply prebuilt databases. CAMPER HMM database file not found at ${params.camper_hmm_db}")
ch_camper_mmseqs_db = file(params.camper_mmseqs_db).exists() ? file(params.camper_mmseqs_db) : error("Error: If using --annotate, you must supply prebuilt databases. CAMPER MMseqs2 database file not found at ${params.camper_mmseqs_db}")
Expand Down Expand Up @@ -440,6 +478,8 @@ workflow DB_CHANNEL_SETUP {
ch_kegg_db
ch_kofam_db
ch_dbcan_db
ch_dbcan3_db
ch_dbcan3_sub_db
ch_camper_hmm_db
ch_camper_mmseqs_db
ch_camper_mmseqs_list
Expand Down
2 changes: 2 additions & 0 deletions workflows/dram.nf
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ workflow DRAM {
use_kegg = getDBFlag(anno_dbs, 'kegg', value_for_all)
use_kofam = getDBFlag(anno_dbs, 'kofam', value_for_all)
use_dbcan = getDBFlag(anno_dbs, 'dbcan', value_for_all)
use_dbcan3 = getDBFlag(anno_dbs, 'dbcan3', value_for_all)
use_camper = getDBFlag(anno_dbs, 'camper', value_for_all)
use_fegenie = getDBFlag(anno_dbs, 'fegenie', value_for_all)
use_methyl = getDBFlag(anno_dbs, 'methyl', value_for_all)
Expand Down Expand Up @@ -230,6 +231,7 @@ workflow DRAM {
use_kegg,
use_kofam,
use_dbcan,
use_dbcan3,
use_camper,
use_fegenie,
use_methyl,
Expand Down
Loading