From 0ca58da6bc435a959b9878b7dd8951576249a0f3 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Fri, 10 Apr 2026 21:46:58 +0000 Subject: [PATCH] databases: accept local FASTA paths for easy nucleotide DB creation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Building a nucleotide database from a FASTA file previously required manually chaining createdb, splitsequence, makepaddedseqdb, and createindex with the right flags. Now a single command does it: mmseqs databases ./input.fasta.gz outdb tmp Both relative (./...) and absolute (/...) paths work — any argument containing '/' that isn't a known database name is treated as a local file. Protein inputs are rejected with a clear error since the indexing pipeline is nucleotide-specific. This keeps `databases` as the single entry point to maintain indexing requirements, and makes it suitable for reindexing external or already manually downloaded databases. --- data/workflow/databases.sh | 44 ++++++++++++++++++++++++++++++++++++++ src/MMseqsBase.cpp | 8 +++++-- src/workflow/Databases.cpp | 25 ++++++++++++++++------ 3 files changed, 68 insertions(+), 9 deletions(-) diff --git a/data/workflow/databases.sh b/data/workflow/databases.sh index c093da1cc..fab3313c3 100644 --- a/data/workflow/databases.sh +++ b/data/workflow/databases.sh @@ -299,6 +299,14 @@ case "${SELECTION}" in push_back "${TMP_PATH}/kalamari.fasta" INPUT_TYPE="FASTA_LIST" ;; + *) + if [ ! -f "${SELECTION}" ]; then + fail "Local file not found: ${SELECTION}" + fi + push_back "${SELECTION}" + date "+%s" > "${TMP_PATH}/version" + INPUT_TYPE="LOCAL_FASTA" + ;; esac if notExists "${OUTDB}.dbtype"; then @@ -312,6 +320,42 @@ case "${INPUT_TYPE}" in rm -f -- "$i" done ;; + "LOCAL_FASTA") + eval "set -- $ARR" + export MMSEQS_FORCE_MERGE=1 + # shellcheck disable=SC2086 + set +e + CREATEDB_LOG=$("${MMSEQS}" createdb "${@}" "${TMP_PATH}/createdb" ${THREADS_COMP_PAR} 2>&1) + CREATEDB_RET=$? + set -e + printf "%s\n" "${CREATEDB_LOG}" + [ $CREATEDB_RET -ne 0 ] && fail "createdb died" + case "${CREATEDB_LOG}" in + *"Database type: Nucleotide"*) ;; + *) fail "Local file indexing only supports nucleotide databases" ;; + esac + # shellcheck disable=SC2086 + "${MMSEQS}" splitsequence "${TMP_PATH}/createdb" "${TMP_PATH}/splitdb" \ + --sequence-overlap 0 --sequence-split-mode 0 --headers-split-mode 1 \ + ${THREADS_PAR} \ + || fail "splitsequence died" + # shellcheck disable=SC2086 + "${MMSEQS}" makepaddedseqdb "${TMP_PATH}/splitdb" "${OUTDB}" \ + ${THREADS_PAR} \ + || fail "makepaddedseqdb died" + mkdir -p "${TMP_PATH}/indexdb" + # shellcheck disable=SC2086 + "${MMSEQS}" createindex "${OUTDB}" "${TMP_PATH}/indexdb" \ + --split 1 --index-subset 2 ${THREADS_PAR} \ + || fail "createindex died" + rm -rf -- "${TMP_PATH}/indexdb" + if [ -n "${REMOVE_TMP}" ]; then + # shellcheck disable=SC2086 + "${MMSEQS}" rmdb "${TMP_PATH}/createdb" ${VERB_PAR} + # shellcheck disable=SC2086 + "${MMSEQS}" rmdb "${TMP_PATH}/splitdb" ${VERB_PAR} + fi + ;; "FSA") # shellcheck disable=SC2086 "${MMSEQS}" createdb "${TMP_PATH}/"*.fsa "${OUTDB}" ${COMP_PAR} --gpu 1 \ diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp index 1fa167298..52dae74c0 100644 --- a/src/MMseqsBase.cpp +++ b/src/MMseqsBase.cpp @@ -129,9 +129,13 @@ std::vector baseCommands = { {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}}, {"databases", databases, &par.databases, COMMAND_DATABASE_CREATION, "List and download databases", - NULL, + "# Download a known database\n" + "mmseqs databases UniRef30_2302 db tmp\n\n" + "# Index a local FASTA file (use ./relative or /absolute paths)\n" + "mmseqs databases ./input.fasta.gz db tmp\n" + "mmseqs databases /data/rnacentral_active.fasta.gz db tmp\n\n", "Milot Mirdita ", - " ", + " ", CITATION_TAXONOMY|CITATION_MMSEQS2, {{"selection", 0, DbType::ZERO_OR_ALL, &DbValidator::empty }, {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}}, diff --git a/src/workflow/Databases.cpp b/src/workflow/Databases.cpp index 2bbe05397..b60d21e6d 100644 --- a/src/workflow/Databases.cpp +++ b/src/workflow/Databases.cpp @@ -276,9 +276,12 @@ int databases(int argc, const char **argv, const Command &command) { } } if (downloadIdx == -1) { - par.printUsageMessage(command, par.help ? MMseqsParameter::COMMAND_EXPERT : 0, description.c_str()); - Debug(Debug::ERROR) << "Selected database " << par.db1 << " was not found\n"; - EXIT(EXIT_FAILURE); + if (par.db1.find('/') == std::string::npos) { + par.printUsageMessage(command, par.help ? MMseqsParameter::COMMAND_EXPERT : 0, description.c_str()); + Debug(Debug::ERROR) << "Selected database " << par.db1 << " was not found\n"; + EXIT(EXIT_FAILURE); + } + // Path contains '/' — treat as local file, shell script validates existence } par.printParameters(command.cmd, argc, argv, par.databases); std::string tmpDir = par.db3; @@ -291,10 +294,14 @@ int databases(int argc, const char **argv, const Command &command) { par.filenames.push_back(tmpDir); CommandCaller cmd; - for (size_t i = 0; i < usedDownloads[downloadIdx].environment.size(); ++i) { - cmd.addVariable(usedDownloads[downloadIdx].environment[i].key, usedDownloads[downloadIdx].environment[i].value); + if (downloadIdx >= 0) { + for (size_t i = 0; i < usedDownloads[downloadIdx].environment.size(); ++i) { + cmd.addVariable(usedDownloads[downloadIdx].environment[i].key, usedDownloads[downloadIdx].environment[i].value); + } + cmd.addVariable("TAXONOMY", usedDownloads[downloadIdx].hasTaxonomy ? "TRUE" : NULL); + } else { + cmd.addVariable("TAXONOMY", NULL); } - cmd.addVariable("TAXONOMY", usedDownloads[downloadIdx].hasTaxonomy ? "TRUE" : NULL); cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); cmd.addVariable("VERB_PAR", par.createParameterString(par.onlyverbosity).c_str()); cmd.addVariable("COMP_PAR", par.createParameterString(par.verbandcompression).c_str()); @@ -303,7 +310,11 @@ int databases(int argc, const char **argv, const Command &command) { cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str()); cmd.addVariable("THREADS_COMP_PAR", par.createParameterString(par.threadsandcompression).c_str()); std::string program = tmpDir + "/download.sh"; - FileUtil::writeFile(program, usedDownloads[downloadIdx].script, usedDownloads[downloadIdx].scriptLength); + if (downloadIdx >= 0) { + FileUtil::writeFile(program, usedDownloads[downloadIdx].script, usedDownloads[downloadIdx].scriptLength); + } else { + FileUtil::writeFile(program, databases_sh, databases_sh_len); + } cmd.execProgram(program.c_str(), par.filenames); // Should never get here