From 9bdeb0c71ab0f4a9d0319735c90b5a6c564bfb8f Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@nvidia.com>
Date: Tue, 28 Apr 2026 01:57:38 +0000
Subject: [PATCH] Fix splitsequence soft-link mode crash with profile DBs

splitsequence --sequence-split-mode 1 (soft-link) wrote position-based
offsets directly as byte offsets into the index. This is correct for
sequence DBs (1 byte per position) but wrong for profile DBs where each
position occupies PROFILE_READIN_SIZE (27) bytes.

When blastdigp.sh runs iterative search on a long query (>10000nt),
extractqueryprofiles produces a dinucleotide profile and splitsequence
splits it in soft-link mode. The resulting index entries pointed to
misaligned byte offsets (e.g. byte 10000 instead of 10000*27=270000),
causing ungappedprefilter to read garbage profile data and segfault in
Sequence::mapProfile.

The bug was latent because the conditions rarely coincided:
  - Target DBs use hard-copy mode (--sequence-split-mode 0), unaffected
  - Query profiles only get split when the query exceeds maxSeqLen
  - Only the pskvins nucleotide pipeline creates profile DBs that pass
    through splitsequence soft-link mode

Fix: multiply startPos and len by PROFILE_READIN_SIZE when writing
soft-link index entries for profile DBs.
---
 src/util/splitsequence.cpp      |  14 ++-
 util/tests/prefilter_crash1/run | 164 ++++++++++++++++++++++++++++++++
 2 files changed, 176 insertions(+), 2 deletions(-)
 create mode 100755 util/tests/prefilter_crash1/run
diff --git a/src/util/splitsequence.cpp b/src/util/splitsequence.cpp
index 3facb7eb4..eb0ae62d1 100644
--- a/src/util/splitsequence.cpp
+++ b/src/util/splitsequence.cpp
@@ -7,6 +7,7 @@
 #include "itoa.h"
 
 #include "Orf.h"
+#include "Sequence.h"
 
 #include <unistd.h>
 #include <climits>
@@ -27,6 +28,11 @@ int splitsequence(int argc, const char **argv, const Command& command) {
     }
     DBReader<unsigned int> reader(par.db1.c_str(), par.db1Index.c_str(), par.threads, mode);
     reader.open(DBReader<unsigned int>::NOSORT);
+    const bool isProfile = Parameters::isEqualDbtype(reader.getDbtype(), Parameters::DBTYPE_HMM_PROFILE);
+    // For profile DBs, each position takes PROFILE_READIN_SIZE bytes in the data file.
+    // Soft-link mode writes raw byte offsets/lengths into the index, so we must
+    // convert position-based startPos/len to byte units.
+    const size_t bytesPerPos = isProfile ? Sequence::PROFILE_READIN_SIZE : 1;
     bool sizeLarger = false;
     for (size_t i = 0; i < reader.getSize(); i++) {
         sizeLarger |= (reader.getSeqLen(i) > par.maxSeqLen);
@@ -98,8 +104,12 @@ int splitsequence(int argc, const char **argv, const Command& command) {
                 size_t len = std::min(par.maxSeqLen, seqLen - (split * par.maxSeqLen - split*sequenceOverlap));
                 size_t startPos = split * par.maxSeqLen - split*sequenceOverlap;
                 if (par.sequenceSplitMode == Parameters::SEQUENCE_SPLIT_MODE_SOFT) {
-                    // +2 to emulate the \n\0
-                    sequenceWriter.writeIndexEntry(key, reader.getOffset(i) + startPos, len+2, thread_idx);
+                    // Convert position-based offset/length to byte-based for the index.
+                    // For sequence DBs bytesPerPos==1; for profile DBs bytesPerPos==PROFILE_READIN_SIZE.
+                    // +1 for null terminator (profiles) or +2 for \n\0 (sequences).
+                    size_t byteOffset = reader.getOffset(i) + startPos * bytesPerPos;
+                    size_t byteLen = len * bytesPerPos + (isProfile ? 1 : 2);
+                    sequenceWriter.writeIndexEntry(key, byteOffset, byteLen, thread_idx);
                 } else {
                     sequenceWriter.writeStart(thread_idx);
                     sequenceWriter.writeAdd(data + startPos, len, thread_idx);
diff --git a/util/tests/prefilter_crash1/run b/util/tests/prefilter_crash1/run
new file mode 100755
index 000000000..bcb75a16e
--- /dev/null
+++ b/util/tests/prefilter_crash1/run
@@ -0,0 +1,164 @@
+#!/usr/bin/env python3
+"""
+Repro for ungappedprefilter crash with split query profiles against a
+pre-split (padded) target database.
+
+The crash occurs when:
+  1. Target DB is pre-split by makepaddedseqdb (sequences > 10000bp)
+  2. Query sequence is also > 10000bp, so blastdigp.sh splits the query
+     profile via splitsequence
+  3. ungappedprefilter segfaults on the split query profile + padded target
+
+Usage:
+    ./run                     # auto-detect mmseqs from build/src/mmseqs
+    ./run /path/to/mmseqs     # use a specific binary
+"""
+
+import os
+import random
+import shutil
+import subprocess
+import sys
+import tempfile
+
+
+def find_mmseqs(argv):
+    if len(argv) >= 2:
+        p = os.path.realpath(argv[1])
+        if os.path.isfile(p) and os.access(p, os.X_OK):
+            return p
+        sys.exit(f"Error: {argv[1]} is not an executable")
+
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    repo_root = os.path.normpath(os.path.join(script_dir, "../../.."))
+    default = os.path.join(repo_root, "build", "src", "mmseqs")
+    if os.path.isfile(default) and os.access(default, os.X_OK):
+        return default
+    sys.exit(
+        f"mmseqs not found at {default}\n"
+        "Build first (cd build && make -j) or pass the path: ./run /path/to/mmseqs"
+    )
+
+
+def generate_fasta(path, num_seqs=10, seq_len=15000, divergence=0.01, seed=42):
+    """Generate a FASTA with near-identical long sequences (>10000bp ensures splitting)."""
+    rng = random.Random(seed)
+    bases = "ACGT"
+
+    base_seq = "".join(rng.choice(bases) for _ in range(seq_len))
+
+    with open(path, "w") as f:
+        for i in range(num_seqs):
+            f.write(f">seq{i} length={seq_len}\n")
+            if i == 0:
+                seq = base_seq
+            else:
+                seq = list(base_seq)
+                for j in range(len(seq)):
+                    if rng.random() < divergence:
+                        seq[j] = rng.choice(bases.replace(seq[j], ""))
+                seq = "".join(seq)
+            for start in range(0, len(seq), 80):
+                f.write(seq[start:start + 80] + "\n")
+
+
+def run_cmd(args, **kwargs):
+    subprocess.run(
+        args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
+        check=True, **kwargs,
+    )
+
+
+def main():
+    mmseqs = find_mmseqs(sys.argv)
+    print(f"Using mmseqs: {mmseqs}")
+
+    tmpdir = tempfile.mkdtemp(prefix="prefilter_crash_")
+    try:
+        rc = _run(mmseqs, tmpdir)
+    finally:
+        shutil.rmtree(tmpdir, ignore_errors=True)
+    sys.exit(rc)
+
+
+def _run(mmseqs, tmpdir):
+    fasta = os.path.join(tmpdir, "input.fasta")
+    query_fasta = os.path.join(tmpdir, "query.fasta")
+    db = os.path.join(tmpdir, "db")
+    split_db = os.path.join(tmpdir, "split_db")
+    padded_db = os.path.join(tmpdir, "padded_db")
+    query_db = os.path.join(tmpdir, "query_db")
+    result_db = os.path.join(tmpdir, "result_db")
+    search_tmp = os.path.join(tmpdir, "tmp")
+    os.makedirs(search_tmp, exist_ok=True)
+
+    # 1. Generate synthetic FASTA: 10 seqs x 15000bp (> maxSeqLen 10000)
+    #    Both target and query are long, so query profile also gets split.
+    print("Generating synthetic FASTA...")
+    generate_fasta(fasta)
+
+    # 2. Extract first sequence as query (15000bp — will be split by search pipeline)
+    with open(fasta) as f:
+        lines = []
+        for line in f:
+            if line.startswith(">") and len(lines) > 0:
+                break
+            lines.append(line)
+    with open(query_fasta, "w") as f:
+        f.writelines(lines)
+
+    # 3. Build pre-split + indexed target DB (manual pipeline)
+    print("Building target DB (createdb -> splitsequence -> makepaddedseqdb -> createindex)...")
+    run_cmd([mmseqs, "createdb", fasta, db, "--threads", "1"])
+    run_cmd([mmseqs, "splitsequence", db, split_db,
+             "--sequence-overlap", "0", "--sequence-split-mode", "0",
+             "--headers-split-mode", "1", "--max-seq-len", "10000",
+             "--threads", "1"])
+    run_cmd([mmseqs, "makepaddedseqdb", split_db, padded_db, "--threads", "1"])
+    run_cmd([mmseqs, "createindex", padded_db, search_tmp, "--split", "1", "--threads", "1"])
+    shutil.rmtree(search_tmp)
+    os.makedirs(search_tmp, exist_ok=True)
+
+    # 4. Create query DB
+    print("Creating query DB...")
+    run_cmd([mmseqs, "createdb", query_fasta, query_db, "--threads", "1"])
+
+    # 5. Run iterative search (--num-iterations 2 triggers query profile splitting)
+    print("Running search (num-iterations=2, prefilter-mode=1, gpu=0)...")
+    proc = subprocess.run(
+        [mmseqs, "search", query_db, padded_db, result_db, search_tmp,
+         "--num-iterations", "2",
+         "-e", "0.1",
+         "--max-seqs", "300",
+         "--prefilter-mode", "1",
+         "--gpu", "0",
+         "--split", "1",
+         "--threads", "1"],
+        capture_output=True, text=True,
+    )
+
+    if proc.returncode != 0:
+        print(f"FAIL  (search crashed: exit code {proc.returncode})")
+        if proc.stderr:
+            for line in proc.stderr.strip().split("\n")[-20:]:
+                print(f"  {line}")
+        return 1
+
+    # 6. Verify the search actually found hits (query is seq0, target contains
+    #    seq0 itself plus 9 near-copies at ~1% divergence — must find hits)
+    result_m8 = os.path.join(tmpdir, "result.m8")
+    run_cmd([mmseqs, "convertalis", query_db, padded_db, result_db, result_m8,
+             "--threads", "1"])
+    with open(result_m8) as f:
+        hits = f.readlines()
+
+    if len(hits) == 0:
+        print("FAIL  (search produced no hits — split query was not matched)")
+        return 1
+
+    print(f"PASS  (search completed, {len(hits)} hits found)")
+    return 0
+
+
+if __name__ == "__main__":
+    main()