Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions data/workflow/databases.sh
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,14 @@ case "${SELECTION}" in
push_back "${TMP_PATH}/kalamari.fasta"
INPUT_TYPE="FASTA_LIST"
;;
*)
    # Fallback arm: SELECTION matched none of the named arms above, so it is
    # interpreted as a path to a local file. Bail out unless it is a regular file.
    [ -f "${SELECTION}" ] || fail "Local file not found: ${SELECTION}"
    push_back "${SELECTION}"
    # Write the current Unix timestamp (seconds since epoch) to the version file.
    date "+%s" > "${TMP_PATH}/version"
    INPUT_TYPE="LOCAL_FASTA"
    ;;
esac

if notExists "${OUTDB}.dbtype"; then
Expand All @@ -312,6 +320,42 @@ case "${INPUT_TYPE}" in
rm -f -- "$i"
done
;;
"LOCAL_FASTA")
# Build an indexed database from a local FASTA input in four steps:
# createdb -> splitsequence -> makepaddedseqdb -> createindex.
# Only inputs that createdb reports as nucleotide are accepted.
# NOTE(review): ARR is presumably the argument list collected by push_back
# earlier in the script - confirm against the helper's definition.
eval "set -- $ARR"
export MMSEQS_FORCE_MERGE=1
# Capture createdb's combined stdout/stderr so the detected database type can
# be inspected afterwards. The "|| CREATEDB_RET=$?" form records a failure
# without tripping errexit and without toggling "set +e"/"set -e", so the
# shell's errexit state is left exactly as it was before this step.
CREATEDB_RET=0
# shellcheck disable=SC2086
CREATEDB_LOG=$("${MMSEQS}" createdb "${@}" "${TMP_PATH}/createdb" ${THREADS_COMP_PAR} 2>&1) || CREATEDB_RET=$?
# Replay the captured log so the user still sees createdb's output.
printf "%s\n" "${CREATEDB_LOG}"
if [ "${CREATEDB_RET}" -ne 0 ]; then
    fail "createdb died"
fi
# Reject anything whose createdb log does not contain the nucleotide marker.
case "${CREATEDB_LOG}" in
    *"Database type: Nucleotide"*) ;;
    *) fail "Local file indexing only supports nucleotide databases" ;;
esac
# shellcheck disable=SC2086
"${MMSEQS}" splitsequence "${TMP_PATH}/createdb" "${TMP_PATH}/splitdb" \
    --sequence-overlap 0 --sequence-split-mode 0 --headers-split-mode 1 \
    ${THREADS_PAR} \
    || fail "splitsequence died"
# shellcheck disable=SC2086
"${MMSEQS}" makepaddedseqdb "${TMP_PATH}/splitdb" "${OUTDB}" \
    ${THREADS_PAR} \
    || fail "makepaddedseqdb died"
# createindex gets its own scratch directory, removed again once it succeeds.
mkdir -p "${TMP_PATH}/indexdb"
# shellcheck disable=SC2086
"${MMSEQS}" createindex "${OUTDB}" "${TMP_PATH}/indexdb" \
    --split 1 --index-subset 2 ${THREADS_PAR} \
    || fail "createindex died"
rm -rf -- "${TMP_PATH}/indexdb"
# Drop the intermediate databases only when REMOVE_TMP is set.
if [ -n "${REMOVE_TMP}" ]; then
    # shellcheck disable=SC2086
    "${MMSEQS}" rmdb "${TMP_PATH}/createdb" ${VERB_PAR}
    # shellcheck disable=SC2086
    "${MMSEQS}" rmdb "${TMP_PATH}/splitdb" ${VERB_PAR}
fi
;;
"FSA")
# shellcheck disable=SC2086
"${MMSEQS}" createdb "${TMP_PATH}/"*.fsa "${OUTDB}" ${COMP_PAR} --gpu 1 \
Expand Down
8 changes: 6 additions & 2 deletions src/MMseqsBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,13 @@ std::vector<Command> baseCommands = {
{"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
{"databases", databases, &par.databases, COMMAND_DATABASE_CREATION,
"List and download databases",
NULL,
"# Download a known database\n"
"mmseqs databases UniRef30_2302 db tmp\n\n"
"# Index a local FASTA file (use ./relative or /absolute paths)\n"
"mmseqs databases ./input.fasta.gz db tmp\n"
"mmseqs databases /data/rnacentral_active.fasta.gz db tmp\n\n",
"Milot Mirdita <milot@mirdita.de>",
"<name> <o:sequenceDB> <tmpDir>",
"<name|path> <o:sequenceDB> <tmpDir>",
CITATION_TAXONOMY|CITATION_MMSEQS2, {{"selection", 0, DbType::ZERO_OR_ALL, &DbValidator::empty },
{"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
{"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
Expand Down
25 changes: 18 additions & 7 deletions src/workflow/Databases.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -276,9 +276,12 @@ int databases(int argc, const char **argv, const Command &command) {
}
}
if (downloadIdx == -1) {
par.printUsageMessage(command, par.help ? MMseqsParameter::COMMAND_EXPERT : 0, description.c_str());
Debug(Debug::ERROR) << "Selected database " << par.db1 << " was not found\n";
EXIT(EXIT_FAILURE);
if (par.db1.find('/') == std::string::npos) {
par.printUsageMessage(command, par.help ? MMseqsParameter::COMMAND_EXPERT : 0, description.c_str());
Debug(Debug::ERROR) << "Selected database " << par.db1 << " was not found\n";
EXIT(EXIT_FAILURE);
}
// Path contains '/' — treat as local file, shell script validates existence
}
par.printParameters(command.cmd, argc, argv, par.databases);
std::string tmpDir = par.db3;
Expand All @@ -291,10 +294,14 @@ int databases(int argc, const char **argv, const Command &command) {
par.filenames.push_back(tmpDir);

CommandCaller cmd;
for (size_t i = 0; i < usedDownloads[downloadIdx].environment.size(); ++i) {
cmd.addVariable(usedDownloads[downloadIdx].environment[i].key, usedDownloads[downloadIdx].environment[i].value);
if (downloadIdx >= 0) {
for (size_t i = 0; i < usedDownloads[downloadIdx].environment.size(); ++i) {
cmd.addVariable(usedDownloads[downloadIdx].environment[i].key, usedDownloads[downloadIdx].environment[i].value);
}
cmd.addVariable("TAXONOMY", usedDownloads[downloadIdx].hasTaxonomy ? "TRUE" : NULL);
} else {
cmd.addVariable("TAXONOMY", NULL);
}
cmd.addVariable("TAXONOMY", usedDownloads[downloadIdx].hasTaxonomy ? "TRUE" : NULL);
cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL);
cmd.addVariable("VERB_PAR", par.createParameterString(par.onlyverbosity).c_str());
cmd.addVariable("COMP_PAR", par.createParameterString(par.verbandcompression).c_str());
Expand All @@ -303,7 +310,11 @@ int databases(int argc, const char **argv, const Command &command) {
cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str());
cmd.addVariable("THREADS_COMP_PAR", par.createParameterString(par.threadsandcompression).c_str());
std::string program = tmpDir + "/download.sh";
FileUtil::writeFile(program, usedDownloads[downloadIdx].script, usedDownloads[downloadIdx].scriptLength);
if (downloadIdx >= 0) {
FileUtil::writeFile(program, usedDownloads[downloadIdx].script, usedDownloads[downloadIdx].scriptLength);
} else {
FileUtil::writeFile(program, databases_sh, databases_sh_len);
}
cmd.execProgram(program.c_str(), par.filenames);

// Should never get here
Expand Down