From fc98281184a5eea9c583e38b9ca34681335b836f Mon Sep 17 00:00:00 2001
From: zkasuran <zkasuran@gmail.com>
Date: Mon, 8 Jun 2026 08:36:51 +0530
Subject: [PATCH] Fuse distributed prefix-suffix multi-SWAP (closes #595)

The localiser performed each prefix<->suffix SWAP in turn, so an amplitude
moved by one SWAP was often moved again by the next, crossing the network
several times. This fuses the group of disjoint SWAPs into one operation that
computes each amplitude's final node and sends it there directly, so every
amplitude crosses the network at most once.

The disjoint SWAPs commute and compose into a single bit permutation. For the
uncontrolled case (every internal caller) the routine enumerates the up to
2^eta-1 destination nodes and packs, exchanges and unpacks only the amplitudes
bound to each. A new cpu_statevec_unpackAmpsFromBuffer scatters the received
sub-buffer back into the strided local amplitudes, the inverse of the existing
packer, looping over moved amplitudes not the whole state.

Scope is CPU/OpenMP. GPU quregs and controlled multi-SWAPs keep the existing
per-SWAP path, so the GPU build is unchanged.

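Comm volume drops 25% at eta=2 and 42% at eta=3, matching theory: the fused
routine sends a fraction 1 - 1/2^eta of each node's amplitudes, versus eta/2
for eta sequential SWAPs (each of which sends half).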
Existing applySwap, applyCompMatr, applyCompMatr2 and calcPartialTrace suites
pass at 1, 2, 4 and 8 ranks.
---
 quest/src/core/accelerator.cpp    | 17 ++++++-
 quest/src/core/accelerator.hpp    |  2 +
 quest/src/core/localiser.cpp      | 73 ++++++++++++++++++++++++++-----
 quest/src/cpu/cpu_subroutines.cpp | 43 +++++++++++++++++-
 quest/src/cpu/cpu_subroutines.hpp |  2 +
 5 files changed, 124 insertions(+), 13 deletions(-)

diff --git a/quest/src/core/accelerator.cpp b/quest/src/core/accelerator.cpp
index 677e6c74a..7752b8557 100644
--- a/quest/src/core/accelerator.cpp
+++ b/quest/src/core/accelerator.cpp
@@ -232,12 +232,27 @@ qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstL
 
     // note qubits may incidentally be ctrls or targs; it doesn't matter
     GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_packAmpsIntoBuffer, qureg, qubits.size() );
-    
+
     // return the number of packed amps, for caller convenience
     return func(qureg, qubits, qubitStates);
 }
 
 
+void accel_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates) {
+
+    // inverse of accel_statevec_packAmpsIntoBuffer; forwards to the CPU routine, which
+    // scatters the received sub-buffer into the local amps where the given qubits are in
+    // the given states (used by the fused multi-SWAP routine). Only the CPU routine is
+    // dispatched and qureg.isGpuAccelerated is NOT consulted here, so callers must pass
+    // only non-GPU quregs (issue #595 notes the OpenMP logic alone is sufficient)
+    if (qubitStates.empty())
+        error_noCtrlsGivenToBufferPacker();
+
+    GET_FUNC_OPTIMISED_FOR_ONE_PARAM( func, cpu_statevec_unpackAmpsFromBuffer, qubits.size() );
+    func(qureg, qubits, qubitStates);
+}
+
+
 qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2) {
 
     return (qureg.isGpuAccelerated)?
diff --git a/quest/src/core/accelerator.hpp b/quest/src/core/accelerator.hpp
index 5a8dc37fb..879ad8d78 100644
--- a/quest/src/core/accelerator.hpp
+++ b/quest/src/core/accelerator.hpp
@@ -171,6 +171,8 @@ void accel_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, PauliS
 
 qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates);
 
+void accel_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates);
+
 qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2);
 
 
diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp
index 83a23b921..ab11f6215 100644
--- a/quest/src/core/localiser.cpp
+++ b/quest/src/core/localiser.cpp
@@ -900,24 +900,75 @@ void anyCtrlMultiSwapBetweenPrefixAndSuffix(Qureg qureg, ConstList64 ctrls, Cons
     // the SWAPs act on unique qubit pairs and so commute.
 
     /// @todo
-    ///   - the sequence of pair-wise full-swaps should be more efficient as a
-    ///     "single" sequence of smaller messages sending amps directly to their
-    ///     final destination node. This could use a new "multiSwap" function.
-    ///   - if the user has compiled cuQuantum, and Qureg is GPU-accelerated, the
-    ///     multiSwap function should use custatevecSwapIndexBits() if local,
-    ///     or custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed,
+    ///   - if the user has compiled cuQuantum, and Qureg is GPU-accelerated, this
+    ///     routine could use custatevecSwapIndexBits() if local, or
+    ///     custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed,
     ///     although the latter requires substantially more work like setting up
     ///     a communicator which may be inelegant alongside our own distribution scheme.
 
-    // perform necessary swaps to move all targets into suffix, each of which invokes communication
+    // collect the non-trivial pairs; each swaps a suffix qubit with a prefix qubit
+    auto suffixTargs = lists_getEmptyList64();
+    auto prefixTargs = lists_getEmptyList64();
     for (size_t i=0; i<targsA.size(); i++) {
-
         if (targsA[i] == targsB[i])
             continue;
+        suffixTargs.push_back(std::min(targsA[i], targsB[i]));
+        prefixTargs.push_back(std::max(targsA[i], targsB[i]));
+    }
+    int numSwaps = suffixTargs.size();
+    if (numSwaps == 0)
+        return;
+
+    // the fused routine below targets the uncontrolled, non-GPU case which every internal
+    // caller currently uses. A controlled multi-SWAP, or a GPU-accelerated Qureg, falls back
+    // to the per-swap routine (issue #595 notes the OpenMP logic alone is sufficient, so the
+    // GPU path is left unchanged)
+    if (!ctrls.empty() || qureg.isGpuAccelerated) {
+        for (int i=0; i<numSwaps; i++)
+            anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, suffixTargs[i], prefixTargs[i]);
+        return;
+    }
+
+    // FUSED multi-SWAP: rather than performing each prefix<->suffix SWAP in turn (which
+    // wastefully relays an amplitude through intermediate nodes before its final node),
+    // we send each amplitude directly to its destination node in a single pass. The
+    // numSwaps disjoint SWAPs compose into one permutation of qubit bits, so an amplitude
+    // of this node moves to the rank obtained by overwriting each prefix-target rank-bit
+    // with the value of its partnered suffix-target bit. We enumerate the (up to)
+    // 2^numSwaps - 1 destination nodes (one per non-empty subset of prefix targets whose
+    // partnered suffix bit disagrees with this node's rank bit) and, for each, pack +
+    // exchange + unpack only the amplitudes bound there. The move is an involution
+    // between paired nodes, so the packed and unpacked amplitudes occupy the same local
+    // slots. See arXiv:quant-ph/0608239 (SWAP fusion) and arXiv:2311.01512 Sec IV.
+
+    std::vector<int> prefBits(numSwaps);
+    std::vector<int> rankBits(numSwaps);
+    for (int i=0; i<numSwaps; i++) {
+        prefBits[i] = util_getPrefixInd(prefixTargs[i], qureg);
+        rankBits[i] = getBit(qureg.rank, prefBits[i]);
+    }
+
+    // subset 0 corresponds to the amplitudes that do not move (all suffix-target bits
+    // already equal their partnered rank bits), so we iterate only subsets 1 onwards
+    qindex numSubsets = powerOf2(numSwaps);
+    for (qindex sub=1; sub<numSubsets; sub++) {
+
+        // the destination rank is this node's rank with the subset's prefix bits flipped,
+        // and the to-be-sent amplitudes are those whose suffix-target bits equal its rank bits
+        auto states = lists_getEmptyList64();
+        int pairRank = qureg.rank;
+        for (int i=0; i<numSwaps; i++) {
+            int inSubset = getBit(sub, i);
+            states.push_back(inSubset ? !rankBits[i] : rankBits[i]);
+            if (inSubset)
+                pairRank = static_cast<int>(flipBit(pairRank, prefBits[i]));
+        }
 
-        int suffixTarg = std::min(targsA[i], targsB[i]);
-        int prefixTarg = std::max(targsA[i], targsB[i]);
-        anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, suffixTarg, prefixTarg);
+        // pack the amplitudes bound for pairRank, exchange, and scatter the received
+        // amplitudes back into those same local slots
+        qindex numPacked = accel_statevec_packAmpsIntoBuffer(qureg, suffixTargs, states);
+        comm_exchangeSubBuffers(qureg, numPacked, pairRank);
+        accel_statevec_unpackAmpsFromBuffer(qureg, suffixTargs, states);
     }
 }
 
diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp
index 59df946e9..524e6d89d 100644
--- a/quest/src/cpu/cpu_subroutines.cpp
+++ b/quest/src/cpu/cpu_subroutines.cpp
@@ -284,8 +284,49 @@ qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu
 INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qindex, cpu_statevec_packAmpsIntoBuffer, (Qureg, ConstList64, ConstList64) )
 
 
+template <int NumQubits>
+void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates) {
+
+    assert_numQubitsMatchesQubitStatesAndTemplateParam(qubitInds.size(), qubitStates.size(), NumQubits);
+
+    // inverse of cpu_statevec_packAmpsIntoBuffer; copies the received sub-buffer (read
+    // contiguously from qureg.cpuCommBuffer, starting at getBufferRecvInd()) into the
+    // local amps where the given qubits are in the given states. It generalises
+    // anyCtrlSwap_subC to multiple constrained qubits, as needed by the fused multi-SWAP.
+
+    // use cpu_qcomp (in lieu of qcomp) even though no arithmetic happens below - just for consistency!
+    cpu_qcomp* amps   = getCpuQcompPtr(qureg.cpuAmps);
+    cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer);
 
-/* 
+    // each constrained qubit halves the amps to unpack (the count is derived, not passed in)
+    qindex numIts = qureg.numAmpsPerNode / powerOf2(qubitInds.size());
+
+    // received amplitudes begin at the buffer's receive offset
+    qindex offset = getBufferRecvInd();
+
+    auto sortedQubitInds = util_getSorted(qubitInds);
+    auto qubitStateMask  = util_getBitMask(qubitInds, qubitStates);
+
+    // use template param to compile-time unroll loop in insertBits()
+    SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubitInds.size());
+
+    #pragma omp parallel for if(qureg.isMultithreaded)
+    for (qindex n=0; n<numIts; n++) {
+
+        // i = nth local index where qubits are in the specified states
+        qindex i = insertBitsWithMaskedValues(n, sortedQubitInds.data(), numBits, qubitStateMask);
+
+        // scatter the contiguous sub-buffer among the strided local amplitudes
+        amps[i] = buffer[offset + n];
+    }
+}
+
+
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_statevec_unpackAmpsFromBuffer, (Qureg, ConstList64, ConstList64) )
+
+
+
+/*
  * SWAPS
  */
 
diff --git a/quest/src/cpu/cpu_subroutines.hpp b/quest/src/cpu/cpu_subroutines.hpp
index 3dbae057b..5925ad5a4 100644
--- a/quest/src/cpu/cpu_subroutines.hpp
+++ b/quest/src/cpu/cpu_subroutines.hpp
@@ -46,6 +46,8 @@ void cpu_fullstatediagmatr_setElemsFromMultiVarFunc(FullStateDiagMatr out, qcomp
 
 template <int NumQubits> qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates);
 
+template <int NumQubits> void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates);
+
 qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2);