Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions src/util/offsetalignment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ void updateOffset(char* data, std::vector<Matcher::result_t> &results, const Orf
res.queryOrfEndPos = -1;
res.dbOrfStartPos = -1;
res.dbOrfEndPos = -1;
if (targetNeedsUpdate == true || qloc == NULL) {
if (targetNeedsUpdate == true) {
size_t targetId = tOrfDBr.sequenceReader->getId(res.dbKey);
char *header = tOrfDBr.sequenceReader->getData(targetId, thread_idx);

Expand Down Expand Up @@ -219,6 +219,11 @@ int offsetalignment(int argc, const char **argv, const Command &command) {
}
// const bool targetNucl = Parameters::isEqualDbtype(targetDbType, Parameters::DBTYPE_NUCLEOTIDES);
const bool targetNucl = true;
// When the original target DB and split target DB are the same path,
// the target was NOT split by the search pipeline (it was pre-split
// by makepaddedseqdb). In this case, target coordinates are already
// relative to individual chunks and must NOT be adjusted.
const bool targetWasSplit = (par.db3.compare(par.db4) != 0);
IndexReader *tSourceDbr = NULL;
bool isSameSrcDB = (par.db3.compare(par.db1) == 0);
bool isNuclNuclSearch = false;
Expand Down Expand Up @@ -399,9 +404,9 @@ int offsetalignment(int argc, const char **argv, const Command &command) {
char *header = qOrfDbr.sequenceReader->getData(queryId, thread_idx);
Orf::SequenceLocation qloc = Orf::parseOrfHeader(header);
if(qloc.id == UINT_MAX){
updateOffset(data, results, NULL, *tOrfDbr, (isNuclNuclSearch||isTransNucTransNucSearch), isNuclNuclSearch, thread_idx);
updateOffset(data, results, NULL, *tOrfDbr, (isNuclNuclSearch||isTransNucTransNucSearch) && targetWasSplit, isNuclNuclSearch, thread_idx);
}else{
updateOffset(data, results, &qloc, *tOrfDbr, (isNuclNuclSearch||isTransNucTransNucSearch), isNuclNuclSearch, thread_idx);
updateOffset(data, results, &qloc, *tOrfDbr, (isNuclNuclSearch||isTransNucTransNucSearch) && targetWasSplit, isNuclNuclSearch, thread_idx);
}
// do not merge entries
if(par.mergeQuery == false){
Expand Down Expand Up @@ -431,7 +436,7 @@ int offsetalignment(int argc, const char **argv, const Command &command) {
qLen = qSourceDbr->sequenceReader->getSeqLen(queryId);
}
char *data = alnDbr.getData(i, thread_idx);
updateOffset(data, results, NULL, *tOrfDbr, true, isNuclNuclSearch, thread_idx);
updateOffset(data, results, NULL, *tOrfDbr, targetWasSplit, isNuclNuclSearch, thread_idx);
}
if(par.mergeQuery == true){
updateLengths(results, qLen, tSourceDbr);
Expand Down
159 changes: 159 additions & 0 deletions util/tests/offsetalignment_crash1/run
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
#!/usr/bin/env python3
"""
Minimal repro for the offsetalignment crash (fixed in b9598041).

The bug triggers when:
1. Target DB is pre-split by makepaddedseqdb (sequences > 10000bp)
2. offsetalignment is called with db3 == db4 (no NEEDTARGETSPLIT)
3. It incorrectly remaps target keys and inflates dbStartPos
4. result2profile then accesses out-of-bounds positions -> segfault

Usage:
./run # auto-detect mmseqs from build/src/mmseqs
./run /path/to/mmseqs # use a specific binary
"""

import os
import random
import shutil
import subprocess
import sys
import tempfile


def find_mmseqs(argv):
if len(argv) >= 2:
p = os.path.realpath(argv[1])
if os.path.isfile(p) and os.access(p, os.X_OK):
return p
sys.exit(f"Error: {argv[1]} is not an executable")

script_dir = os.path.dirname(os.path.abspath(__file__))
repo_root = os.path.normpath(os.path.join(script_dir, "../../.."))
default = os.path.join(repo_root, "build", "src", "mmseqs")
if os.path.isfile(default) and os.access(default, os.X_OK):
return default
sys.exit(
f"mmseqs not found at {default}\n"
"Build first (cd build && make -j) or pass the path: ./run /path/to/mmseqs"
)


def generate_fastas(target_path, query_path, num_seqs=10, seq_len=15000,
query_len=8000, divergence=0.01, seed=42):
"""Generate target FASTA (long seqs, >10000bp ensures splitting) and a short query.

The query is a subsequence of the first target sequence (taken from the
second chunk region, positions 5000..5000+query_len) so the prefilter
reliably finds hits against split chunks.
"""
rng = random.Random(seed)
bases = "ACGT"

base_seq = "".join(rng.choice(bases) for _ in range(seq_len))

with open(target_path, "w") as f:
for i in range(num_seqs):
f.write(f">seq{i} length={seq_len}\n")
if i == 0:
seq = base_seq
else:
seq = list(base_seq)
for j in range(len(seq)):
if rng.random() < divergence:
seq[j] = rng.choice(bases.replace(seq[j], ""))
seq = "".join(seq)
for start in range(0, len(seq), 80):
f.write(seq[start:start + 80] + "\n")

# Query: short enough to NOT be split (<10000bp), taken from second
# chunk region so it matches split target chunks with from>0.
query_seq = base_seq[5000:5000 + query_len]
with open(query_path, "w") as f:
f.write(f">query length={query_len}\n")
for start in range(0, len(query_seq), 80):
f.write(query_seq[start:start + 80] + "\n")


def run_cmd(args, **kwargs):
subprocess.run(
args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
check=True, **kwargs,
)


def main():
mmseqs = find_mmseqs(sys.argv)
print(f"Using mmseqs: {mmseqs}")

tmpdir = tempfile.mkdtemp(prefix="offsetalign_crash_")
try:
rc = _run(mmseqs, tmpdir)
finally:
shutil.rmtree(tmpdir, ignore_errors=True)
sys.exit(rc)


def _run(mmseqs, tmpdir):
fasta = os.path.join(tmpdir, "input.fasta")
query_fasta = os.path.join(tmpdir, "query.fasta")
db = os.path.join(tmpdir, "db")
split_db = os.path.join(tmpdir, "split_db")
padded_db = os.path.join(tmpdir, "padded_db")
query_db = os.path.join(tmpdir, "query_db")
result_db = os.path.join(tmpdir, "result_db")
search_tmp = os.path.join(tmpdir, "tmp")
os.makedirs(search_tmp, exist_ok=True)

# 1. Generate synthetic FASTA files
# Target: 10 seqs x 15000bp (> maxSeqLen 10000, guarantees splitting)
# Query: 8000bp subsequence from second chunk region (< 10000, won't be split)
print("Generating synthetic FASTA...")
generate_fastas(fasta, query_fasta)

# 2. Build pre-split + indexed target DB (manual pipeline)
# Target is pre-split by makepaddedseqdb, so NEEDTARGETSPLIT is not
# set in blastdigp.sh, causing db3 == db4 in offsetalignment.
print("Building target DB (createdb -> splitsequence -> makepaddedseqdb -> createindex)...")
run_cmd([mmseqs, "createdb", fasta, db, "--threads", "1"])
run_cmd([mmseqs, "splitsequence", db, split_db,
"--sequence-overlap", "0", "--sequence-split-mode", "0",
"--headers-split-mode", "1", "--max-seq-len", "10000",
"--threads", "1"])
run_cmd([mmseqs, "makepaddedseqdb", split_db, padded_db, "--threads", "1"])
run_cmd([mmseqs, "createindex", padded_db, search_tmp, "--split", "1", "--threads", "1"])
# Clean index tmp so search gets a fresh tmp dir
shutil.rmtree(search_tmp)
os.makedirs(search_tmp, exist_ok=True)

# 3. Create query DB
print("Creating query DB...")
run_cmd([mmseqs, "createdb", query_fasta, query_db, "--threads", "1"])

# 4. Run iterative search (--num-iterations 2 triggers offsetalignment + result2profile)
print("Running search (num-iterations=2, prefilter-mode=1, gpu=0)...")
proc = subprocess.run(
[mmseqs, "search", query_db, padded_db, result_db, search_tmp,
"--num-iterations", "2",
"-e", "0.1",
"--max-seqs", "300",
"--prefilter-mode", "1",
"--gpu", "0",
"--split", "1",
"--threads", "1"],
capture_output=True, text=True,
)

if proc.returncode == 0:
print("PASS (search completed without crash)")
return 0
else:
print(f"FAIL (search crashed: exit code {proc.returncode})")
if proc.stderr:
for line in proc.stderr.strip().split("\n")[-20:]:
print(f" {line}")
return 1


if __name__ == "__main__":
main()