diff --git a/src/util/offsetalignment.cpp b/src/util/offsetalignment.cpp index 314575eba..58d132bca 100644 --- a/src/util/offsetalignment.cpp +++ b/src/util/offsetalignment.cpp @@ -98,7 +98,7 @@ void updateOffset(char* data, std::vector &results, const Orf res.queryOrfEndPos = -1; res.dbOrfStartPos = -1; res.dbOrfEndPos = -1; - if (targetNeedsUpdate == true || qloc == NULL) { + if (targetNeedsUpdate == true) { size_t targetId = tOrfDBr.sequenceReader->getId(res.dbKey); char *header = tOrfDBr.sequenceReader->getData(targetId, thread_idx); @@ -219,6 +219,11 @@ int offsetalignment(int argc, const char **argv, const Command &command) { } // const bool targetNucl = Parameters::isEqualDbtype(targetDbType, Parameters::DBTYPE_NUCLEOTIDES); const bool targetNucl = true; + // When the original target DB and split target DB are the same path, + // the target was NOT split by the search pipeline (it was pre-split + // by makepaddedseqdb). In this case, target coordinates are already + // relative to individual chunks and must NOT be adjusted. + const bool targetWasSplit = (par.db3.compare(par.db4) != 0); IndexReader *tSourceDbr = NULL; bool isSameSrcDB = (par.db3.compare(par.db1) == 0); bool isNuclNuclSearch = false; @@ -399,9 +404,9 @@ int offsetalignment(int argc, const char **argv, const Command &command) { char *header = qOrfDbr.sequenceReader->getData(queryId, thread_idx); Orf::SequenceLocation qloc = Orf::parseOrfHeader(header); if(qloc.id == UINT_MAX){ - updateOffset(data, results, NULL, *tOrfDbr, (isNuclNuclSearch||isTransNucTransNucSearch), isNuclNuclSearch, thread_idx); + updateOffset(data, results, NULL, *tOrfDbr, (isNuclNuclSearch||isTransNucTransNucSearch) && targetWasSplit, isNuclNuclSearch, thread_idx); }else{ - updateOffset(data, results, &qloc, *tOrfDbr, (isNuclNuclSearch||isTransNucTransNucSearch), isNuclNuclSearch, thread_idx); + updateOffset(data, results, &qloc, *tOrfDbr, (isNuclNuclSearch||isTransNucTransNucSearch) && targetWasSplit, isNuclNuclSearch, thread_idx); } // do not merge entries if(par.mergeQuery == false){ @@ -431,7 +436,7 @@ int offsetalignment(int argc, const char **argv, const Command &command) { qLen = qSourceDbr->sequenceReader->getSeqLen(queryId); } char *data = alnDbr.getData(i, thread_idx); - updateOffset(data, results, NULL, *tOrfDbr, true, isNuclNuclSearch, thread_idx); + updateOffset(data, results, NULL, *tOrfDbr, targetWasSplit, isNuclNuclSearch, thread_idx); } if(par.mergeQuery == true){ updateLengths(results, qLen, tSourceDbr); diff --git a/util/tests/offsetalignment_crash1/run b/util/tests/offsetalignment_crash1/run new file mode 100755 index 000000000..8c6d55852 --- /dev/null +++ b/util/tests/offsetalignment_crash1/run @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 +""" +Minimal repro for the offsetalignment crash (fixed in b9598041). + +The bug triggers when: + 1. Target DB is pre-split by makepaddedseqdb (sequences > 10000bp) + 2. offsetalignment is called with db3 == db4 (no NEEDTARGETSPLIT) + 3. It incorrectly remaps target keys and inflates dbStartPos + 4. result2profile then accesses out-of-bounds positions -> segfault + +Usage: + ./run # auto-detect mmseqs from build/src/mmseqs + ./run /path/to/mmseqs # use a specific binary +""" + +import os +import random +import shutil +import subprocess +import sys +import tempfile + + +def find_mmseqs(argv): + if len(argv) >= 2: + p = os.path.realpath(argv[1]) + if os.path.isfile(p) and os.access(p, os.X_OK): + return p + sys.exit(f"Error: {argv[1]} is not an executable") + + script_dir = os.path.dirname(os.path.abspath(__file__)) + repo_root = os.path.normpath(os.path.join(script_dir, "../../..")) + default = os.path.join(repo_root, "build", "src", "mmseqs") + if os.path.isfile(default) and os.access(default, os.X_OK): + return default + sys.exit( + f"mmseqs not found at {default}\n" + "Build first (cd build && make -j) or pass the path: ./run /path/to/mmseqs" + ) + + +def generate_fastas(target_path, query_path, num_seqs=10, seq_len=15000, + query_len=8000, divergence=0.01, seed=42): + """Generate target FASTA (long seqs, >10000bp ensures splitting) and a short query. + + The query is a subsequence of the first target sequence (taken from the + second chunk region, positions 5000..5000+query_len) so the prefilter + reliably finds hits against split chunks. + """ + rng = random.Random(seed) + bases = "ACGT" + + base_seq = "".join(rng.choice(bases) for _ in range(seq_len)) + + with open(target_path, "w") as f: + for i in range(num_seqs): + f.write(f">seq{i} length={seq_len}\n") + if i == 0: + seq = base_seq + else: + seq = list(base_seq) + for j in range(len(seq)): + if rng.random() < divergence: + seq[j] = rng.choice(bases.replace(seq[j], "")) + seq = "".join(seq) + for start in range(0, len(seq), 80): + f.write(seq[start:start + 80] + "\n") + + # Query: short enough to NOT be split (<10000bp), taken from second + # chunk region so it matches split target chunks with from>0. + query_seq = base_seq[5000:5000 + query_len] + with open(query_path, "w") as f: + f.write(f">query length={query_len}\n") + for start in range(0, len(query_seq), 80): + f.write(query_seq[start:start + 80] + "\n") + + +def run_cmd(args, **kwargs): + subprocess.run( + args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, + check=True, **kwargs, + ) + + +def main(): + mmseqs = find_mmseqs(sys.argv) + print(f"Using mmseqs: {mmseqs}") + + tmpdir = tempfile.mkdtemp(prefix="offsetalign_crash_") + try: + rc = _run(mmseqs, tmpdir) + finally: + shutil.rmtree(tmpdir, ignore_errors=True) + sys.exit(rc) + + +def _run(mmseqs, tmpdir): + fasta = os.path.join(tmpdir, "input.fasta") + query_fasta = os.path.join(tmpdir, "query.fasta") + db = os.path.join(tmpdir, "db") + split_db = os.path.join(tmpdir, "split_db") + padded_db = os.path.join(tmpdir, "padded_db") + query_db = os.path.join(tmpdir, "query_db") + result_db = os.path.join(tmpdir, "result_db") + search_tmp = os.path.join(tmpdir, "tmp") + os.makedirs(search_tmp, exist_ok=True) + + # 1. Generate synthetic FASTA files + # Target: 10 seqs x 15000bp (> maxSeqLen 10000, guarantees splitting) + # Query: 8000bp subsequence from second chunk region (< 10000, won't be split) + print("Generating synthetic FASTA...") + generate_fastas(fasta, query_fasta) + + # 2. Build pre-split + indexed target DB (manual pipeline) + # Target is pre-split by makepaddedseqdb, so NEEDTARGETSPLIT is not + # set in blastdigp.sh, causing db3 == db4 in offsetalignment. + print("Building target DB (createdb -> splitsequence -> makepaddedseqdb -> createindex)...") + run_cmd([mmseqs, "createdb", fasta, db, "--threads", "1"]) + run_cmd([mmseqs, "splitsequence", db, split_db, + "--sequence-overlap", "0", "--sequence-split-mode", "0", + "--headers-split-mode", "1", "--max-seq-len", "10000", + "--threads", "1"]) + run_cmd([mmseqs, "makepaddedseqdb", split_db, padded_db, "--threads", "1"]) + run_cmd([mmseqs, "createindex", padded_db, search_tmp, "--split", "1", "--threads", "1"]) + # Clean index tmp so search gets a fresh tmp dir + shutil.rmtree(search_tmp) + os.makedirs(search_tmp, exist_ok=True) + + # 3. Create query DB + print("Creating query DB...") + run_cmd([mmseqs, "createdb", query_fasta, query_db, "--threads", "1"]) + + # 4. Run iterative search (--num-iterations 2 triggers offsetalignment + result2profile) + print("Running search (num-iterations=2, prefilter-mode=1, gpu=0)...") + proc = subprocess.run( + [mmseqs, "search", query_db, padded_db, result_db, search_tmp, + "--num-iterations", "2", + "-e", "0.1", + "--max-seqs", "300", + "--prefilter-mode", "1", + "--gpu", "0", + "--split", "1", + "--threads", "1"], + capture_output=True, text=True, + ) + + if proc.returncode == 0: + print("PASS (search completed without crash)") + return 0 + else: + print(f"FAIL (search crashed: exit code {proc.returncode})") + if proc.stderr: + for line in proc.stderr.strip().split("\n")[-20:]: + print(f" {line}") + return 1 + + +if __name__ == "__main__": + main()