From 0ca58da6bc435a959b9878b7dd8951576249a0f3 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Fri, 10 Apr 2026 21:46:58 +0000 Subject: [PATCH 1/3] databases: accept local FASTA paths for easy nucleotide DB creation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Building a nucleotide database from a FASTA file previously required manually chaining createdb, splitsequence, makepaddedseqdb, and createindex with the right flags. Now a single command does it: mmseqs databases ./input.fasta.gz outdb tmp Both relative (./...) and absolute (/...) paths work — any argument containing '/' that isn't a known database name is treated as a local file. Protein inputs are rejected with a clear error since the indexing pipeline is nucleotide-specific. This keeps `databases` as the single entry point to maintain indexing requirements, and makes it suitable for reindexing external or already manually downloaded databases. --- data/workflow/databases.sh | 44 ++++++++++++++++++++++++++++++++++++++ src/MMseqsBase.cpp | 8 +++++-- src/workflow/Databases.cpp | 25 ++++++++++++++++------ 3 files changed, 68 insertions(+), 9 deletions(-) diff --git a/data/workflow/databases.sh b/data/workflow/databases.sh index c093da1cc..fab3313c3 100644 --- a/data/workflow/databases.sh +++ b/data/workflow/databases.sh @@ -299,6 +299,14 @@ case "${SELECTION}" in push_back "${TMP_PATH}/kalamari.fasta" INPUT_TYPE="FASTA_LIST" ;; + *) + if [ ! -f "${SELECTION}" ]; then + fail "Local file not found: ${SELECTION}" + fi + push_back "${SELECTION}" + date "+%s" > "${TMP_PATH}/version" + INPUT_TYPE="LOCAL_FASTA" + ;; esac if notExists "${OUTDB}.dbtype"; then @@ -312,6 +320,42 @@ case "${INPUT_TYPE}" in rm -f -- "$i" done ;; + "LOCAL_FASTA") + eval "set -- $ARR" + export MMSEQS_FORCE_MERGE=1 + # shellcheck disable=SC2086 + set +e + CREATEDB_LOG=$("${MMSEQS}" createdb "${@}" "${TMP_PATH}/createdb" ${THREADS_COMP_PAR} 2>&1) + CREATEDB_RET=$? 
+ set -e + printf "%s\n" "${CREATEDB_LOG}" + [ $CREATEDB_RET -ne 0 ] && fail "createdb died" + case "${CREATEDB_LOG}" in + *"Database type: Nucleotide"*) ;; + *) fail "Local file indexing only supports nucleotide databases" ;; + esac + # shellcheck disable=SC2086 + "${MMSEQS}" splitsequence "${TMP_PATH}/createdb" "${TMP_PATH}/splitdb" \ + --sequence-overlap 0 --sequence-split-mode 0 --headers-split-mode 1 \ + ${THREADS_PAR} \ + || fail "splitsequence died" + # shellcheck disable=SC2086 + "${MMSEQS}" makepaddedseqdb "${TMP_PATH}/splitdb" "${OUTDB}" \ + ${THREADS_PAR} \ + || fail "makepaddedseqdb died" + mkdir -p "${TMP_PATH}/indexdb" + # shellcheck disable=SC2086 + "${MMSEQS}" createindex "${OUTDB}" "${TMP_PATH}/indexdb" \ + --split 1 --index-subset 2 ${THREADS_PAR} \ + || fail "createindex died" + rm -rf -- "${TMP_PATH}/indexdb" + if [ -n "${REMOVE_TMP}" ]; then + # shellcheck disable=SC2086 + "${MMSEQS}" rmdb "${TMP_PATH}/createdb" ${VERB_PAR} + # shellcheck disable=SC2086 + "${MMSEQS}" rmdb "${TMP_PATH}/splitdb" ${VERB_PAR} + fi + ;; "FSA") # shellcheck disable=SC2086 "${MMSEQS}" createdb "${TMP_PATH}/"*.fsa "${OUTDB}" ${COMP_PAR} --gpu 1 \ diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp index 1fa167298..52dae74c0 100644 --- a/src/MMseqsBase.cpp +++ b/src/MMseqsBase.cpp @@ -129,9 +129,13 @@ std::vector baseCommands = { {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}}, {"databases", databases, &par.databases, COMMAND_DATABASE_CREATION, "List and download databases", - NULL, + "# Download a known database\n" + "mmseqs databases UniRef30_2302 db tmp\n\n" + "# Index a local FASTA file (use ./relative or /absolute paths)\n" + "mmseqs databases ./input.fasta.gz db tmp\n" + "mmseqs databases /data/rnacentral_active.fasta.gz db tmp\n\n", "Milot Mirdita ", - " ", + " ", CITATION_TAXONOMY|CITATION_MMSEQS2, {{"selection", 0, DbType::ZERO_OR_ALL, &DbValidator::empty }, {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, 
&DbValidator::sequenceDb }, {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}}, diff --git a/src/workflow/Databases.cpp b/src/workflow/Databases.cpp index 2bbe05397..b60d21e6d 100644 --- a/src/workflow/Databases.cpp +++ b/src/workflow/Databases.cpp @@ -276,9 +276,12 @@ int databases(int argc, const char **argv, const Command &command) { } } if (downloadIdx == -1) { - par.printUsageMessage(command, par.help ? MMseqsParameter::COMMAND_EXPERT : 0, description.c_str()); - Debug(Debug::ERROR) << "Selected database " << par.db1 << " was not found\n"; - EXIT(EXIT_FAILURE); + if (par.db1.find('/') == std::string::npos) { + par.printUsageMessage(command, par.help ? MMseqsParameter::COMMAND_EXPERT : 0, description.c_str()); + Debug(Debug::ERROR) << "Selected database " << par.db1 << " was not found\n"; + EXIT(EXIT_FAILURE); + } + // Path contains '/' — treat as local file, shell script validates existence } par.printParameters(command.cmd, argc, argv, par.databases); std::string tmpDir = par.db3; @@ -291,10 +294,14 @@ int databases(int argc, const char **argv, const Command &command) { par.filenames.push_back(tmpDir); CommandCaller cmd; - for (size_t i = 0; i < usedDownloads[downloadIdx].environment.size(); ++i) { - cmd.addVariable(usedDownloads[downloadIdx].environment[i].key, usedDownloads[downloadIdx].environment[i].value); + if (downloadIdx >= 0) { + for (size_t i = 0; i < usedDownloads[downloadIdx].environment.size(); ++i) { + cmd.addVariable(usedDownloads[downloadIdx].environment[i].key, usedDownloads[downloadIdx].environment[i].value); + } + cmd.addVariable("TAXONOMY", usedDownloads[downloadIdx].hasTaxonomy ? "TRUE" : NULL); + } else { + cmd.addVariable("TAXONOMY", NULL); } - cmd.addVariable("TAXONOMY", usedDownloads[downloadIdx].hasTaxonomy ? "TRUE" : NULL); cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? 
"TRUE" : NULL); cmd.addVariable("VERB_PAR", par.createParameterString(par.onlyverbosity).c_str()); cmd.addVariable("COMP_PAR", par.createParameterString(par.verbandcompression).c_str()); @@ -303,7 +310,11 @@ int databases(int argc, const char **argv, const Command &command) { cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str()); cmd.addVariable("THREADS_COMP_PAR", par.createParameterString(par.threadsandcompression).c_str()); std::string program = tmpDir + "/download.sh"; - FileUtil::writeFile(program, usedDownloads[downloadIdx].script, usedDownloads[downloadIdx].scriptLength); + if (downloadIdx >= 0) { + FileUtil::writeFile(program, usedDownloads[downloadIdx].script, usedDownloads[downloadIdx].scriptLength); + } else { + FileUtil::writeFile(program, databases_sh, databases_sh_len); + } cmd.execProgram(program.c_str(), par.filenames); // Should never get here From c7c8f0e1bee8737e8de3d7667a3849d95ec326d3 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Sat, 11 Apr 2026 00:19:42 +0000 Subject: [PATCH 2/3] databases: add --download-dir for persistent download caching Add a --download-dir parameter (default: "downloads") that provides a persistent cache directory for downloaded files. This avoids re-downloading large files across runs. The directory is resolved to an absolute path and created if it does not exist. The shell script receives it as DOWNLOAD_DIR environment variable. 
--- src/commons/Parameters.cpp | 3 +++ src/commons/Parameters.h | 2 ++ src/workflow/Databases.cpp | 14 ++++++++++++++ 3 files changed, 19 insertions(+) diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index ddd41d2c4..5a5800979 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -70,6 +70,7 @@ Parameters::Parameters(): PARAM_PRELOAD_MODE(PARAM_PRELOAD_MODE_ID, "--db-load-mode", "Preload mode", "Database preload mode 0: auto, 1: fread, 2: mmap, 3: mmap+touch", typeid(int), (void *) &preloadMode, "[0-3]{1}", MMseqsParameter::COMMAND_COMMON | MMseqsParameter::COMMAND_EXPERT), PARAM_SPACED_KMER_PATTERN(PARAM_SPACED_KMER_PATTERN_ID, "--spaced-kmer-pattern", "Spaced k-mer pattern", "User-specified spaced k-mer pattern", typeid(std::string), (void *) &spacedKmerPattern, "^1[01]*1$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT), PARAM_LOCAL_TMP(PARAM_LOCAL_TMP_ID, "--local-tmp", "Local temporary path", "Path where some of the temporary files will be created", typeid(std::string), (void *) &localTmp, "", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT), + PARAM_DOWNLOAD_DIR(PARAM_DOWNLOAD_DIR_ID, "--download-dir", "Download cache directory", "Persistent directory for downloaded files (avoids re-downloading)", typeid(std::string), (void *) &downloadDir, "", MMseqsParameter::COMMAND_MISC), // alignment PARAM_ALIGNMENT_MODE(PARAM_ALIGNMENT_MODE_ID, "--alignment-mode", "Alignment mode", "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id\n4: only ungapped alignment", typeid(int), (void *) &alignmentMode, "^[0-5]{1}$", MMseqsParameter::COMMAND_ALIGN), PARAM_ALIGNMENT_OUTPUT_MODE(PARAM_ALIGNMENT_OUTPUT_MODE_ID, "--alignment-output-mode", "Alignment mode", "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id\n4: only ungapped alignment\n5: score only (output) cluster 
format", typeid(int), (void *) &alignmentOutputMode, "^[0-1]{1}$", MMseqsParameter::COMMAND_ALIGN), @@ -1633,6 +1634,7 @@ Parameters::Parameters(): databases.push_back(&PARAM_COMPRESSED); databases.push_back(&PARAM_THREADS); databases.push_back(&PARAM_V); + databases.push_back(&PARAM_DOWNLOAD_DIR); // tar2db tar2db.push_back(&PARAM_OUTPUT_DBTYPE); @@ -2527,6 +2529,7 @@ void Parameters::setDefaults() { splitAA = false; spacedKmerPattern = ""; localTmp = ""; + downloadDir = "downloads"; // search workflow numIterations = 1; diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index 1e0eb1e41..d37c68103 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -445,6 +445,7 @@ class Parameters { int realignMaxSeqs; // Max alignments to realign std::string spacedKmerPattern; // User-specified kmer pattern std::string localTmp; // Local temporary path + std::string downloadDir; // Persistent download cache directory // ALIGNMENT @@ -824,6 +825,7 @@ class Parameters { PARAMETER(PARAM_PRELOAD_MODE) PARAMETER(PARAM_SPACED_KMER_PATTERN) PARAMETER(PARAM_LOCAL_TMP) + PARAMETER(PARAM_DOWNLOAD_DIR) std::vector prefilter; std::vector ungappedprefilter; std::vector gappedprefilter; diff --git a/src/workflow/Databases.cpp b/src/workflow/Databases.cpp index b60d21e6d..f278b3d19 100644 --- a/src/workflow/Databases.cpp +++ b/src/workflow/Databases.cpp @@ -309,6 +309,20 @@ int databases(int argc, const char **argv, const Command &command) { cmd.addVariable("ARIA_NUM_CONN", SSTR(std::min(16, par.threads)).c_str()); cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str()); cmd.addVariable("THREADS_COMP_PAR", par.createParameterString(par.threadsandcompression).c_str()); + if (downloadIdx >= 0 && par.downloadDir.empty() == false) { + std::string dlDir = par.downloadDir; + // Resolve relative paths against CWD + if (dlDir[0] != '/') { + char cwd[PATH_MAX]; + if (getcwd(cwd, sizeof(cwd)) != NULL) { + dlDir = std::string(cwd) + "/" + 
dlDir; + } + } + if (FileUtil::directoryExists(dlDir.c_str()) == false) { + FileUtil::makeDir(dlDir.c_str()); + } + cmd.addVariable("DOWNLOAD_DIR", dlDir.c_str()); + } std::string program = tmpDir + "/download.sh"; if (downloadIdx >= 0) { FileUtil::writeFile(program, usedDownloads[downloadIdx].script, usedDownloads[downloadIdx].scriptLength); From 7d98645d99353684b859d810dcbbb6d3fd982f87 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Sat, 11 Apr 2026 00:19:53 +0000 Subject: [PATCH 3/3] databases: add RNAcentral_current and RNAcentral_26_0 Add RNAcentral active sequences as downloadable databases: - RNAcentral_current: latest release (non-deterministic) - RNAcentral_26_0: pinned release 26.0 (deterministic) Both use the LOCAL_FASTA pipeline for nucleotide indexing and store downloads under <download-dir>/<database-name>/ subdirectories. --- data/workflow/databases.sh | 14 ++++++++++++++ src/workflow/Databases.cpp | 14 ++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/data/workflow/databases.sh b/data/workflow/databases.sh index fab3313c3..adb195b39 100644 --- a/data/workflow/databases.sh +++ b/data/workflow/databases.sh @@ -299,6 +299,20 @@ case "${SELECTION}" in push_back "${TMP_PATH}/kalamari.fasta" INPUT_TYPE="FASTA_LIST" ;; + "RNAcentral_current"|"RNAcentral_26_0") + case "${SELECTION}" in + "RNAcentral_current") RNACENTRAL_URL="https://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/sequences/rnacentral_active.fasta.gz" ;; + "RNAcentral_26_0") RNACENTRAL_URL="https://ftp.ebi.ac.uk/pub/databases/RNAcentral/releases/26.0/sequences/rnacentral_active.fasta.gz" ;; + esac + DL_DIR="${DOWNLOAD_DIR:-${TMP_PATH}}/${SELECTION}" + mkdir -p "${DL_DIR}" + if notExists "${DL_DIR}/rnacentral_active.fasta.gz"; then + downloadFile "${RNACENTRAL_URL}" "${DL_DIR}/rnacentral_active.fasta.gz" + fi + date "+%s" > "${TMP_PATH}/version" + push_back "${DL_DIR}/rnacentral_active.fasta.gz" + INPUT_TYPE="LOCAL_FASTA" + ;; *) if [ ! 
-f "${SELECTION}" ]; then fail "Local file not found: ${SELECTION}" diff --git a/src/workflow/Databases.cpp b/src/workflow/Databases.cpp index f278b3d19..2eaad5a2d 100644 --- a/src/workflow/Databases.cpp +++ b/src/workflow/Databases.cpp @@ -157,6 +157,20 @@ std::vector downloads = {{ "https://github.com/lskatz/Kalamari", true, Parameters::DBTYPE_NUCLEOTIDES, databases_sh, databases_sh_len, { } + }, { + "RNAcentral_current", + "RNAcentral active sequences (latest release). Non-deterministic: contents change with each release.", + "RNAcentral Consortium: RNAcentral 2021: secondary structure integration, improved sequence search and new member databases. Nucleic Acids Res 49(D1), D212-D220 (2021)", + "https://rnacentral.org", + false, Parameters::DBTYPE_NUCLEOTIDES, databases_sh, databases_sh_len, + { } + }, { + "RNAcentral_26_0", + "RNAcentral active sequences (release 26.0, deterministic).", + "RNAcentral Consortium: RNAcentral 2021: secondary structure integration, improved sequence search and new member databases. Nucleic Acids Res 49(D1), D212-D220 (2021)", + "https://rnacentral.org", + false, Parameters::DBTYPE_NUCLEOTIDES, databases_sh, databases_sh_len, + { } }, };