Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions data/workflow/databases.sh
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,28 @@ case "${SELECTION}" in
push_back "${TMP_PATH}/kalamari.fasta"
INPUT_TYPE="FASTA_LIST"
;;
"RNAcentral_current"|"RNAcentral_26_0")
    # Download the RNAcentral active-sequence FASTA for the selected release
    # and hand it to the LOCAL_FASTA processing step.
    case "${SELECTION}" in
        "RNAcentral_current") RNACENTRAL_URL="https://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/sequences/rnacentral_active.fasta.gz" ;;
        "RNAcentral_26_0") RNACENTRAL_URL="https://ftp.ebi.ac.uk/pub/databases/RNAcentral/releases/26.0/sequences/rnacentral_active.fasta.gz" ;;
    esac
    # Cache directory is per selection; falls back to TMP_PATH when
    # DOWNLOAD_DIR is unset or empty.
    DL_DIR="${DOWNLOAD_DIR:-${TMP_PATH}}/${SELECTION}"
    mkdir -p "${DL_DIR}"
    if notExists "${DL_DIR}/rnacentral_active.fasta.gz"; then
        # Download under a temporary name and rename only after success, so an
        # interrupted transfer is never mistaken for a complete cached file on
        # the next run.
        # NOTE(review): assumes downloadFile aborts the script on failure, as
        # the original unguarded call already did - confirm.
        DL_PART="${DL_DIR}/rnacentral_active.fasta.gz.part"
        # Start from a clean slate: drop leftovers of an earlier aborted
        # attempt (the ".aria2" name is presumably aria2c's control file).
        rm -f -- "${DL_PART}" "${DL_PART}.aria2"
        downloadFile "${RNACENTRAL_URL}" "${DL_PART}"
        mv -f -- "${DL_PART}" "${DL_DIR}/rnacentral_active.fasta.gz"
    fi
    # Record the current Unix time as the database version.
    date "+%s" > "${TMP_PATH}/version"
    push_back "${DL_DIR}/rnacentral_active.fasta.gz"
    INPUT_TYPE="LOCAL_FASTA"
    ;;
*)
    # Fallback arm: any other selection is treated as the path of a local
    # FASTA file, which must exist as a regular file.
    [ -f "${SELECTION}" ] || fail "Local file not found: ${SELECTION}"
    # Record the current Unix time as the database version.
    date "+%s" > "${TMP_PATH}/version"
    push_back "${SELECTION}"
    INPUT_TYPE="LOCAL_FASTA"
    ;;
esac

if notExists "${OUTDB}.dbtype"; then
Expand All @@ -312,6 +334,42 @@ case "${INPUT_TYPE}" in
rm -f -- "$i"
done
;;
"LOCAL_FASTA")
    # Build OUTDB from local FASTA input: createdb -> splitsequence ->
    # makepaddedseqdb -> createindex. Only nucleotide input is accepted.

    # Restore the collected input paths as positional parameters
    # (ARR is presumably filled by the push_back calls above - confirm).
    eval "set -- $ARR"
    export MMSEQS_FORCE_MERGE=1
    # The createdb output is captured because it is inspected below. errexit is
    # suspended so the log can be printed before a failure is reported.
    set +e
    # shellcheck disable=SC2086
    CREATEDB_LOG=$("${MMSEQS}" createdb "${@}" "${TMP_PATH}/createdb" ${THREADS_COMP_PAR} 2>&1)
    CREATEDB_RET=$?
    set -e
    printf "%s\n" "${CREATEDB_LOG}"
    if [ "${CREATEDB_RET}" -ne 0 ]; then
        fail "createdb died"
    fi
    # NOTE(review): the database type is detected from createdb's log text.
    # If a lower verbosity in THREADS_COMP_PAR suppresses that message, valid
    # nucleotide input would be rejected here - confirm.
    case "${CREATEDB_LOG}" in
        *"Database type: Nucleotide"*) ;;
        *) fail "Local file indexing only supports nucleotide databases" ;;
    esac
    # shellcheck disable=SC2086
    "${MMSEQS}" splitsequence "${TMP_PATH}/createdb" "${TMP_PATH}/splitdb" \
        --sequence-overlap 0 --sequence-split-mode 0 --headers-split-mode 1 \
        ${THREADS_PAR} \
        || fail "splitsequence died"
    # shellcheck disable=SC2086
    "${MMSEQS}" makepaddedseqdb "${TMP_PATH}/splitdb" "${OUTDB}" \
        ${THREADS_PAR} \
        || fail "makepaddedseqdb died"
    # createindex gets its own scratch directory, removed right afterwards.
    mkdir -p "${TMP_PATH}/indexdb"
    # shellcheck disable=SC2086
    "${MMSEQS}" createindex "${OUTDB}" "${TMP_PATH}/indexdb" \
        --split 1 --index-subset 2 ${THREADS_PAR} \
        || fail "createindex died"
    rm -rf -- "${TMP_PATH}/indexdb"
    # Intermediate databases are deleted only on request. The input FASTA
    # files themselves are never removed in this arm.
    if [ -n "${REMOVE_TMP}" ]; then
        # shellcheck disable=SC2086
        "${MMSEQS}" rmdb "${TMP_PATH}/createdb" ${VERB_PAR}
        # shellcheck disable=SC2086
        "${MMSEQS}" rmdb "${TMP_PATH}/splitdb" ${VERB_PAR}
    fi
    ;;
"FSA")
# shellcheck disable=SC2086
"${MMSEQS}" createdb "${TMP_PATH}/"*.fsa "${OUTDB}" ${COMP_PAR} --gpu 1 \
Expand Down
8 changes: 6 additions & 2 deletions src/MMseqsBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,13 @@ std::vector<Command> baseCommands = {
{"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
{"databases", databases, &par.databases, COMMAND_DATABASE_CREATION,
"List and download databases",
NULL,
"# Download a known database\n"
"mmseqs databases UniRef30_2302 db tmp\n\n"
"# Index a local FASTA file (use ./relative or /absolute paths)\n"
"mmseqs databases ./input.fasta.gz db tmp\n"
"mmseqs databases /data/rnacentral_active.fasta.gz db tmp\n\n",
"Milot Mirdita <milot@mirdita.de>",
"<name> <o:sequenceDB> <tmpDir>",
"<name|path> <o:sequenceDB> <tmpDir>",
CITATION_TAXONOMY|CITATION_MMSEQS2, {{"selection", 0, DbType::ZERO_OR_ALL, &DbValidator::empty },
{"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
{"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
Expand Down
3 changes: 3 additions & 0 deletions src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ Parameters::Parameters():
PARAM_PRELOAD_MODE(PARAM_PRELOAD_MODE_ID, "--db-load-mode", "Preload mode", "Database preload mode 0: auto, 1: fread, 2: mmap, 3: mmap+touch", typeid(int), (void *) &preloadMode, "[0-3]{1}", MMseqsParameter::COMMAND_COMMON | MMseqsParameter::COMMAND_EXPERT),
PARAM_SPACED_KMER_PATTERN(PARAM_SPACED_KMER_PATTERN_ID, "--spaced-kmer-pattern", "Spaced k-mer pattern", "User-specified spaced k-mer pattern", typeid(std::string), (void *) &spacedKmerPattern, "^1[01]*1$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
PARAM_LOCAL_TMP(PARAM_LOCAL_TMP_ID, "--local-tmp", "Local temporary path", "Path where some of the temporary files will be created", typeid(std::string), (void *) &localTmp, "", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
PARAM_DOWNLOAD_DIR(PARAM_DOWNLOAD_DIR_ID, "--download-dir", "Download cache directory", "Persistent directory for downloaded files (avoids re-downloading)", typeid(std::string), (void *) &downloadDir, "", MMseqsParameter::COMMAND_MISC),
// alignment
PARAM_ALIGNMENT_MODE(PARAM_ALIGNMENT_MODE_ID, "--alignment-mode", "Alignment mode", "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id\n4: only ungapped alignment", typeid(int), (void *) &alignmentMode, "^[0-5]{1}$", MMseqsParameter::COMMAND_ALIGN),
PARAM_ALIGNMENT_OUTPUT_MODE(PARAM_ALIGNMENT_OUTPUT_MODE_ID, "--alignment-output-mode", "Alignment mode", "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id\n4: only ungapped alignment\n5: score only (output) cluster format", typeid(int), (void *) &alignmentOutputMode, "^[0-1]{1}$", MMseqsParameter::COMMAND_ALIGN),
Expand Down Expand Up @@ -1633,6 +1634,7 @@ Parameters::Parameters():
databases.push_back(&PARAM_COMPRESSED);
databases.push_back(&PARAM_THREADS);
databases.push_back(&PARAM_V);
databases.push_back(&PARAM_DOWNLOAD_DIR);

// tar2db
tar2db.push_back(&PARAM_OUTPUT_DBTYPE);
Expand Down Expand Up @@ -2527,6 +2529,7 @@ void Parameters::setDefaults() {
splitAA = false;
spacedKmerPattern = "";
localTmp = "";
downloadDir = "downloads";

// search workflow
numIterations = 1;
Expand Down
2 changes: 2 additions & 0 deletions src/commons/Parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,7 @@ class Parameters {
int realignMaxSeqs; // Max alignments to realign
std::string spacedKmerPattern; // User-specified kmer pattern
std::string localTmp; // Local temporary path
std::string downloadDir; // Persistent download cache directory


// ALIGNMENT
Expand Down Expand Up @@ -824,6 +825,7 @@ class Parameters {
PARAMETER(PARAM_PRELOAD_MODE)
PARAMETER(PARAM_SPACED_KMER_PATTERN)
PARAMETER(PARAM_LOCAL_TMP)
PARAMETER(PARAM_DOWNLOAD_DIR)
std::vector<MMseqsParameter*> prefilter;
std::vector<MMseqsParameter*> ungappedprefilter;
std::vector<MMseqsParameter*> gappedprefilter;
Expand Down
53 changes: 46 additions & 7 deletions src/workflow/Databases.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,20 @@ std::vector<DatabaseDownload> downloads = {{
"https://github.com/lskatz/Kalamari",
true, Parameters::DBTYPE_NUCLEOTIDES, databases_sh, databases_sh_len,
{ }
}, {
"RNAcentral_current",
"RNAcentral active sequences (latest release). Non-deterministic: contents change with each release.",
"RNAcentral Consortium: RNAcentral 2021: secondary structure integration, improved sequence search and new member databases. Nucleic Acids Res 49(D1), D212-D220 (2021)",
"https://rnacentral.org",
false, Parameters::DBTYPE_NUCLEOTIDES, databases_sh, databases_sh_len,
{ }
}, {
"RNAcentral_26_0",
"RNAcentral active sequences (release 26.0, deterministic).",
"RNAcentral Consortium: RNAcentral 2021: secondary structure integration, improved sequence search and new member databases. Nucleic Acids Res 49(D1), D212-D220 (2021)",
"https://rnacentral.org",
false, Parameters::DBTYPE_NUCLEOTIDES, databases_sh, databases_sh_len,
{ }
},
};

Expand Down Expand Up @@ -276,9 +290,12 @@ int databases(int argc, const char **argv, const Command &command) {
}
}
if (downloadIdx == -1) {
par.printUsageMessage(command, par.help ? MMseqsParameter::COMMAND_EXPERT : 0, description.c_str());
Debug(Debug::ERROR) << "Selected database " << par.db1 << " was not found\n";
EXIT(EXIT_FAILURE);
if (par.db1.find('/') == std::string::npos) {
par.printUsageMessage(command, par.help ? MMseqsParameter::COMMAND_EXPERT : 0, description.c_str());
Debug(Debug::ERROR) << "Selected database " << par.db1 << " was not found\n";
EXIT(EXIT_FAILURE);
}
// Path contains '/' — treat as local file, shell script validates existence
}
par.printParameters(command.cmd, argc, argv, par.databases);
std::string tmpDir = par.db3;
Expand All @@ -291,19 +308,41 @@ int databases(int argc, const char **argv, const Command &command) {
par.filenames.push_back(tmpDir);

CommandCaller cmd;
for (size_t i = 0; i < usedDownloads[downloadIdx].environment.size(); ++i) {
cmd.addVariable(usedDownloads[downloadIdx].environment[i].key, usedDownloads[downloadIdx].environment[i].value);
if (downloadIdx >= 0) {
for (size_t i = 0; i < usedDownloads[downloadIdx].environment.size(); ++i) {
cmd.addVariable(usedDownloads[downloadIdx].environment[i].key, usedDownloads[downloadIdx].environment[i].value);
}
cmd.addVariable("TAXONOMY", usedDownloads[downloadIdx].hasTaxonomy ? "TRUE" : NULL);
} else {
cmd.addVariable("TAXONOMY", NULL);
}
cmd.addVariable("TAXONOMY", usedDownloads[downloadIdx].hasTaxonomy ? "TRUE" : NULL);
cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL);
cmd.addVariable("VERB_PAR", par.createParameterString(par.onlyverbosity).c_str());
cmd.addVariable("COMP_PAR", par.createParameterString(par.verbandcompression).c_str());
// aria2c gives an (undocumented) error with more than 16 connections
cmd.addVariable("ARIA_NUM_CONN", SSTR(std::min(16, par.threads)).c_str());
cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str());
cmd.addVariable("THREADS_COMP_PAR", par.createParameterString(par.threadsandcompression).c_str());
if (downloadIdx >= 0 && par.downloadDir.empty() == false) {
std::string dlDir = par.downloadDir;
// Resolve relative paths against CWD
if (dlDir[0] != '/') {
char cwd[PATH_MAX];
if (getcwd(cwd, sizeof(cwd)) != NULL) {
dlDir = std::string(cwd) + "/" + dlDir;
}
}
if (FileUtil::directoryExists(dlDir.c_str()) == false) {
FileUtil::makeDir(dlDir.c_str());
}
cmd.addVariable("DOWNLOAD_DIR", dlDir.c_str());
}
std::string program = tmpDir + "/download.sh";
FileUtil::writeFile(program, usedDownloads[downloadIdx].script, usedDownloads[downloadIdx].scriptLength);
if (downloadIdx >= 0) {
FileUtil::writeFile(program, usedDownloads[downloadIdx].script, usedDownloads[downloadIdx].scriptLength);
} else {
FileUtil::writeFile(program, databases_sh, databases_sh_len);
}
cmd.execProgram(program.c_str(), par.filenames);

// Should never get here
Expand Down