diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index c59334b55..be80d437a 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -17,6 +17,7 @@
 #include "quest/src/core/autodeployer.hpp"
 #include "quest/src/core/validation.hpp"
 #include "quest/src/core/randomiser.hpp"
+#include "quest/src/core/accelerator.hpp"
 #include "quest/src/comm/comm_config.hpp"
 #include "quest/src/cpu/cpu_config.hpp"
 #include "quest/src/gpu/gpu_config.hpp"
@@ -459,6 +460,9 @@ void finalizeQuESTEnv() {
     // calling this will not automatically
     // free the memory of existing Quregs
 
+    // free the persistent fused-multi-swap staging workspace (host or device), if any
+    accel_clearFusedSwapSendCache();
+
     if (global_envPtr->isGpuAccelerated)
         gpu_clearCache(); // syncs first
 
diff --git a/quest/src/comm/comm_routines.cpp b/quest/src/comm/comm_routines.cpp
index cf6956454..31a317118 100644
--- a/quest/src/comm/comm_routines.cpp
+++ b/quest/src/comm/comm_routines.cpp
@@ -242,6 +242,45 @@ void exchangeArrays(qcomp* send, qcomp* recv, qindex numElems, int pairRank) {
 }
 
 
+void exchangeArraysWithMultiplePartners(
+    qcomp* send, qcomp* recv,
+    const vector<int>& partnerRanks,
+    const vector<qindex>& sendInds, const vector<qindex>& recvInds,
+    qindex numAmpsPerBlock
+) {
+#if QUEST_COMPILE_MPI
+
+    MPI_Comm mpiComm = comm_getMpiComm();
+
+    // each per-partner block is divided into power-of-2 messages; we issue ALL of every
+    // partner's asynchronous send/receives up-front (a personalised all-to-all) and wait
+    // once, so that a fused multi-swap completes in a single communication round
+    auto [messageSize, numMessages] = dividePow2PayloadIntoMessages(numAmpsPerBlock);
+
+    vector<MPI_Request> requests(2 * partnerRanks.size() * numMessages, MPI_REQUEST_NULL);
+    size_t r = 0;
+
+    for (size_t b=0; b<partnerRanks.size(); b++) {
+        int pairRank = partnerRanks[b];
+
+        // messages to/from distinct partners share tags safely (distinguished by rank);
+        // within a partner, unique tags permit out-of-order arrival (UCX adaptive-routing)
+        for (qindex m=0; m<numMessages; m++) {
+            int tag = static_cast<int>(m);
+            MPI_Irecv(&recv[recvInds[b] + m*messageSize], messageSize, MPI_QCOMP, pairRank, tag, mpiComm, &requests[r++]);
+            MPI_Isend(&send[sendInds[b] + m*messageSize], messageSize, MPI_QCOMP, pairRank, tag, mpiComm, &requests[r++]);
+        }
+    }
+
+    // single wait completes the whole round (MPI will automatically free the request memory)
+    MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE);
+
+#else
+    error_commButEnvNotDistributed();
+#endif
+}
+
+
 
 /*
  * PRIVATE ASYNC SEND AND RECEIVE
@@ -528,11 +567,50 @@ void comm_exchangeSubBuffers(Qureg qureg, qindex numAmps, int pairRank) {
 
     if (qureg.isGpuAccelerated)
         exchangeGpuSubBuffers(qureg, numAmps, pairRank);
-    else 
+    else
         exchangeArrays(&qureg.cpuCommBuffer[sendInd], &qureg.cpuCommBuffer[recvInd], numAmps, pairRank);
 }
 
 
+void comm_exchangeAmpsToBuffersForFusedSwap(
+    Qureg qureg, qcomp* sendBuf,
+    vector<int> partnerRanks, vector<qindex> blockSendInds, vector<qindex> blockRecvInds,
+    qindex numAmpsPerBlock
+) {
+    assert_commQuregIsDistributed(qureg);
+    for (int pairRank : partnerRanks)
+        assert_pairRankIsDistinct(qureg, pairRank);
+
+    // 'sendBuf' is a contiguous staging buffer (in the qureg's memory space) holding one
+    // packed block per subcube partner; received blocks are written into the qureg's
+    // commBuffer. The total staged payload is < numAmpsPerNode (the self-block never moves).
+    qindex sendSpan = 0;
+    qindex recvSpan = 0;
+    for (size_t b=0; b<partnerRanks.size(); b++) {
+        sendSpan = std::max(sendSpan, blockSendInds[b] + numAmpsPerBlock);
+        recvSpan = std::max(recvSpan, blockRecvInds[b] + numAmpsPerBlock);
+    }
+
+    // non-GPU quregs exchange host staging buffer to host commBuffer directly
+    if (!qureg.isGpuAccelerated) {
+        exchangeArraysWithMultiplePartners(sendBuf, qureg.cpuCommBuffer, partnerRanks, blockSendInds, blockRecvInds, numAmpsPerBlock);
+        return;
+    }
+
+    // GPU quregs exchange VRAM directly when supported
+    if (gpu_isDirectGpuCommPossible()) {
+        gpu_sync();
+        exchangeArraysWithMultiplePartners(sendBuf, qureg.gpuCommBuffer, partnerRanks, blockSendInds, blockRecvInds, numAmpsPerBlock);
+        return;
+    }
+
+    // otherwise route VRAM through RAM, reusing cpuAmps (mere host mirror in GPU mode) as send scratch
+    gpu_copyGpuToCpu(qureg, sendBuf, qureg.cpuAmps, sendSpan);
+    exchangeArraysWithMultiplePartners(qureg.cpuAmps, qureg.cpuCommBuffer, partnerRanks, blockSendInds, blockRecvInds, numAmpsPerBlock);
+    gpu_copyCpuToGpu(qureg, qureg.cpuCommBuffer, qureg.gpuCommBuffer, recvSpan);
+}
+
+
 void comm_asynchSendSubBuffer(Qureg qureg, qindex numElems, int pairRank) {
 
     auto [sendInd, recvInd] = getSubBufferSendRecvInds(qureg);
diff --git a/quest/src/comm/comm_routines.hpp b/quest/src/comm/comm_routines.hpp
index e75e889f6..bd62abf0b 100644
--- a/quest/src/comm/comm_routines.hpp
+++ b/quest/src/comm/comm_routines.hpp
@@ -31,6 +31,8 @@ void comm_exchangeAmpsToBuffers(Qureg qureg, int pairRank);
 
 void comm_exchangeSubBuffers(Qureg qureg, qindex numAmpsAndRecvInd, int pairRank);
 
+void comm_exchangeAmpsToBuffersForFusedSwap(Qureg qureg, qcomp* sendBuf, vector<int> partnerRanks, vector<qindex> blockSendInds, vector<qindex> blockRecvInds, qindex numAmpsPerBlock);
+
 void comm_asynchSendSubBuffer(Qureg qureg, qindex numElems, int pairRank);
 
 void comm_receiveArrayToBuffer(Qureg qureg, qindex numElems, int pairRank);
diff --git a/quest/src/core/accelerator.cpp b/quest/src/core/accelerator.cpp
index 677e6c74a..b5503ab81 100644
--- a/quest/src/core/accelerator.cpp
+++ b/quest/src/core/accelerator.cpp
@@ -246,6 +246,71 @@ qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int
 }
 
 
+// persistent, lazily-grown staging workspace reused across fused multi-swaps, so we
+// avoid a large (de)allocation (host malloc or cudaMalloc) on every call. This mirrors
+// QuEST's existing 'gpuCache' for the dense-matrix kernel, and the persistent workspaces
+// of cuStateVec / mpiQulacs. It is freed by accel_clearFusedSwapSendCache(), invoked at
+// environment teardown (finalizeQuESTEnv).
+static qcomp* fusedSwapSendCache    = nullptr;
+static qindex fusedSwapSendCacheLen = 0;
+static bool   fusedSwapSendCacheIsGpu = false;
+
+
+qcomp* accel_allocFusedSwapSendBuffer(Qureg qureg, qindex numAmps) {
+
+    // discard a stale cache if the memory space (RAM vs VRAM) has changed
+    if (fusedSwapSendCache != nullptr && fusedSwapSendCacheIsGpu != qureg.isGpuAccelerated)
+        accel_clearFusedSwapSendCache();
+
+    // reuse the existing workspace when already large enough
+    if (numAmps <= fusedSwapSendCacheLen)
+        return fusedSwapSendCache;
+
+    // otherwise grow it (freeing the old, smaller buffer first)
+    if (fusedSwapSendCache != nullptr)
+        (fusedSwapSendCacheIsGpu)? gpu_deallocArray(fusedSwapSendCache) : cpu_deallocArray(fusedSwapSendCache);
+
+    fusedSwapSendCacheIsGpu = qureg.isGpuAccelerated;
+    fusedSwapSendCache = (fusedSwapSendCacheIsGpu)? gpu_allocArray(numAmps) : cpu_allocArray(numAmps);
+    fusedSwapSendCacheLen = numAmps;
+    return fusedSwapSendCache;
+}
+
+
+void accel_deallocFusedSwapSendBuffer(Qureg qureg, qcomp* buffer) {
+
+    // no-op: the staging workspace persists and is reused by subsequent fused
+    // multi-swaps; it is released at environment teardown. Kept for call-site symmetry.
+    (void) qureg;
+    (void) buffer;
+}
+
+
+void accel_clearFusedSwapSendCache() {
+
+    if (fusedSwapSendCache == nullptr)
+        return;
+
+    (fusedSwapSendCacheIsGpu)? gpu_deallocArray(fusedSwapSendCache) : cpu_deallocArray(fusedSwapSendCache);
+    fusedSwapSendCache = nullptr;
+    fusedSwapSendCacheLen = 0;
+}
+
+
+void accel_statevec_packAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qcomp* sendBuf, qindex sendOffset) {
+
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_packAmpsForFusedSwap, qureg, qubits.size() );
+    func(qureg, qubits, qubitStates, sendBuf, sendOffset);
+}
+
+
+void accel_statevec_unpackAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex recvOffset) {
+
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_unpackAmpsForFusedSwap, qureg, qubits.size() );
+    func(qureg, qubits, qubitStates, recvOffset);
+}
+
+
 
 /*
  * SWAPS
diff --git a/quest/src/core/accelerator.hpp b/quest/src/core/accelerator.hpp
index 5a8dc37fb..456592b42 100644
--- a/quest/src/core/accelerator.hpp
+++ b/quest/src/core/accelerator.hpp
@@ -173,6 +173,13 @@ qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstL
 
 qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2);
 
+qcomp* accel_allocFusedSwapSendBuffer(Qureg qureg, qindex numAmps);
+void accel_deallocFusedSwapSendBuffer(Qureg qureg, qcomp* buffer);
+void accel_clearFusedSwapSendCache();
+
+void accel_statevec_packAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qcomp* sendBuf, qindex sendOffset);
+void accel_statevec_unpackAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex recvOffset);
+
 
 /*
  * SWAPS
diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp
index 83a23b921..ccfc641f2 100644
--- a/quest/src/core/localiser.cpp
+++ b/quest/src/core/localiser.cpp
@@ -34,6 +34,7 @@
 #include <complex>
 #include <algorithm>
 #include <unordered_map>
+#include <cstdlib>
 
 using std::vector;
 using std::tuple;
@@ -893,6 +894,107 @@ void localiser_statevec_anyCtrlSwap(Qureg qureg, ConstList64 ctrls, ConstList64
  */
 
 
+void fusedMultiSwapBetweenPrefixAndSuffix(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 prefixTargs, ConstList64 suffixTargs) {
+
+    // we fuse 'numPairs' (>= 2) disjoint prefix<->suffix SWAPs into a single communication
+    // round. The 2^numPairs ranks differing from ours only in the swapped prefix qubits'
+    // rank-bits form a subcube; identifying a rank by the 'numPairs'-bit address of those
+    // rank-bits, the multi-swap relocates a local amp with suffix-target bit-pattern 'v' on
+    // rank-address 'a' to rank-address 'v' with new pattern 'a'. Hence the block with v==a
+    // stays put, and for each of the 2^numPairs - 1 partners we swap the block whose suffix
+    // pattern equals the partner's address. These index sets are disjoint, so all partner
+    // exchanges proceed concurrently. See arXiv:2311.01512, arXiv:quant-ph/0608239.
+
+    int numPairs = prefixTargs.size();
+
+    // rank-bit positions of the prefix (global) targets, and our rank's bit values there
+    vector<int> rankBitInds(numPairs);
+    vector<int> myAddrBits(numPairs);
+    for (int i=0; i<numPairs; i++) {
+        rankBitInds[i] = util_getPrefixInd(prefixTargs[i], qureg);
+        myAddrBits[i]  = util_getRankBitOfQubit(prefixTargs[i], qureg);
+    }
+
+    // each control and each swap-target halves the per-partner block of swapped local amps
+    qindex numAmpsPerBlock = qureg.numAmpsPerNode / powerOf2(numPairs + ctrls.size());
+
+    // packing constrains the swap-targets (to a partner-specific pattern) and the controls
+    List64 packQubits = ctrls;
+    for (int i=0; i<numPairs; i++)
+        packQubits.push_back(suffixTargs[i]);
+
+    // there are 2^numPairs - 1 partners (non-empty subsets of flipped rank-bits)
+    int numPartners = powerOf2(numPairs) - 1;
+    vector<int>    partnerRanks(numPartners);
+    vector<qindex> blockInds(numPartners);
+
+    // stage all partners' send-blocks contiguously in a temporary buffer (< one node's state)
+    qcomp* sendBuf = accel_allocFusedSwapSendBuffer(qureg, numPartners * numAmpsPerBlock);
+
+    // pack each partner's block, identified by subset 'sub' of flipped rank-bits
+    for (int sub=1; sub<=numPartners; sub++) {
+
+        int partner = sub - 1;
+        blockInds[partner] = partner * numAmpsPerBlock;
+
+        int pairRank = qureg.rank;
+        List64 packStates = ctrlStates;
+
+        for (int i=0; i<numPairs; i++) {
+            int flip = (sub >> i) & 1;
+            if (flip)
+                pairRank = flipBit(pairRank, rankBitInds[i]);
+
+            // partner's block holds local amps whose suffix-target[i] == partner's address bit i
+            packStates.push_back(myAddrBits[i] ^ flip);
+        }
+
+        partnerRanks[partner] = pairRank;
+        accel_statevec_packAmpsForFusedSwap(qureg, packQubits, packStates, sendBuf, blockInds[partner]);
+    }
+
+    // single all-to-all round: send each block to its partner, receive theirs into commBuffer
+    comm_exchangeAmpsToBuffersForFusedSwap(qureg, sendBuf, partnerRanks, blockInds, blockInds, numAmpsPerBlock);
+
+    // scatter each received block back into the same local indices it was packed from
+    for (int sub=1; sub<=numPartners; sub++) {
+
+        int partner = sub - 1;
+        List64 packStates = ctrlStates;
+        for (int i=0; i<numPairs; i++)
+            packStates.push_back(myAddrBits[i] ^ ((sub >> i) & 1));
+
+        accel_statevec_unpackAmpsForFusedSwap(qureg, packQubits, packStates, blockInds[partner]);
+    }
+
+    accel_deallocFusedSwapSendBuffer(qureg, sendBuf);
+}
+
+
+void multiSwapSequentially(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 prefixTargs, ConstList64 suffixTargs) {
+
+    // reference (pre-fusion) behaviour: perform each disjoint prefix<->suffix SWAP one-at-a-time,
+    // wastefully relocating amplitudes once per swap. Retained for correctness comparison and
+    // benchmarking against the fused single-round path; never the default at runtime.
+    for (size_t i=0; i<prefixTargs.size(); i++)
+        anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, suffixTargs[i], prefixTargs[i]);
+}
+
+
+bool localiser_isFusedSwapDisabled() {
+
+    // honour an optional environment toggle (QUEST_DISABLE_SWAP_FUSION=1) so benchmarks can
+    // compare the fused single-round path against the legacy sequential path without recompiling.
+    // read once and cached; default (unset) keeps fusion enabled.
+    static int cached = -1;
+    if (cached == -1) {
+        const char* env = std::getenv("QUEST_DISABLE_SWAP_FUSION");
+        cached = (env != nullptr && env[0] == '1') ? 1 : 0;
+    }
+    return cached == 1;
+}
+
+
 void anyCtrlMultiSwapBetweenPrefixAndSuffix(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targsA, ConstList64 targsB) {
 
     // this is an internal function called by the below routines which require
@@ -900,25 +1002,41 @@ void anyCtrlMultiSwapBetweenPrefixAndSuffix(Qureg qureg, ConstList64 ctrls, Cons
     // the SWAPs act on unique qubit pairs and so commute.
 
     /// @todo
-    ///   - the sequence of pair-wise full-swaps should be more efficient as a
-    ///     "single" sequence of smaller messages sending amps directly to their
-    ///     final destination node. This could use a new "multiSwap" function.
     ///   - if the user has compiled cuQuantum, and Qureg is GPU-accelerated, the
-    ///     multiSwap function should use custatevecSwapIndexBits() if local,
-    ///     or custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed,
+    ///     multiSwap could instead use custatevecSwapIndexBits() if local, or
+    ///     custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed,
     ///     although the latter requires substantially more work like setting up
     ///     a communicator which may be inelegant alongside our own distribution scheme.
 
-    // perform necessary swaps to move all targets into suffix, each of which invokes communication
-    for (size_t i=0; i<targsA.size(); i++) {
+    // collect the disjoint pairs needing communication; for each, the suffix (local)
+    // qubit is the smaller index and the prefix (global) qubit is the larger
+    List64 prefixTargs = lists_getEmptyList64();
+    List64 suffixTargs = lists_getEmptyList64();
 
+    for (size_t i=0; i<targsA.size(); i++) {
         if (targsA[i] == targsB[i])
             continue;
+        suffixTargs.push_back(std::min(targsA[i], targsB[i]));
+        prefixTargs.push_back(std::max(targsA[i], targsB[i]));
+    }
 
-        int suffixTarg = std::min(targsA[i], targsB[i]);
-        int prefixTarg = std::max(targsA[i], targsB[i]);
-        anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, suffixTarg, prefixTarg);
+    int numPairs = prefixTargs.size();
+
+    // nothing to do, or a single swap reduces to the existing routine (needs no staging buffer)
+    if (numPairs == 0)
+        return;
+    if (numPairs == 1) {
+        anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, suffixTargs[0], prefixTargs[0]);
+        return;
     }
+
+    // multiple swaps are fused, sending each amplitude directly to its final node in a single
+    // round, rather than wastefully relocating it once per sequential swap. The benchmark toggle
+    // permits comparing against the legacy sequential behaviour.
+    if (localiser_isFusedSwapDisabled())
+        multiSwapSequentially(qureg, ctrls, ctrlStates, prefixTargs, suffixTargs);
+    else
+        fusedMultiSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, prefixTargs, suffixTargs);
 }
 
 
diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp
index 59df946e9..d8b331229 100644
--- a/quest/src/cpu/cpu_subroutines.cpp
+++ b/quest/src/cpu/cpu_subroutines.cpp
@@ -285,7 +285,82 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qindex, cpu_statevec_packAmpsIntoBuffe
 
 
 
-/* 
+/*
+ * FUSED SWAP BUFFER PACKING
+ *
+ * which pack/unpack one destination-pattern block of a fused multi-swap into/from
+ * an explicitly given buffer at an explicit offset. Unlike the routines above which
+ * always use qureg's commBuffer at fixed offsets, these enable many disjoint blocks
+ * (one per subcube partner) to be staged contiguously for a single all-to-all round.
+ */
+
+
+template <int NumQubits>
+void cpu_statevec_packAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qcomp* sendBuf, qindex sendOffset) {
+
+    assert_numQubitsMatchesQubitStatesAndTemplateParam(qubits.size(), qubitStates.size(), NumQubits);
+
+    // use cpu_qcomp (in lieu of qcomp) even though no arithmetic happens below - just for consistency!
+    cpu_qcomp* amps   = getCpuQcompPtr(qureg.cpuAmps);
+    cpu_qcomp* buffer = getCpuQcompPtr(sendBuf);
+
+    // each constrained qubit (ctrl or swap-target) halves the needed iterations
+    qindex numIts = qureg.numAmpsPerNode / powerOf2(qubits.size());
+
+    auto sortedQubits   = util_getSorted(qubits);
+    auto qubitStateMask = util_getBitMask(qubits, qubitStates);
+
+    // use template param to compile-time unroll loop in insertBits()
+    SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.size());
+
+    #pragma omp parallel for if(qureg.isMultithreaded)
+    for (qindex n=0; n<numIts; n++) {
+
+        // i = nth local index where qubits are in specified states
+        qindex i = insertBitsWithMaskedValues(n, sortedQubits.data(), numBits, qubitStateMask);
+
+        // pack the potentially-strided amplitudes into the contiguous send block
+        buffer[sendOffset + n] = amps[i];
+    }
+}
+
+
+template <int NumQubits>
+void cpu_statevec_unpackAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex recvOffset) {
+
+    assert_numQubitsMatchesQubitStatesAndTemplateParam(qubits.size(), qubitStates.size(), NumQubits);
+
+    // use cpu_qcomp (in lieu of qcomp) even though no arithmetic happens below - just for consistency!
+    cpu_qcomp* amps   = getCpuQcompPtr(qureg.cpuAmps);
+    cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer);
+
+    // each constrained qubit (ctrl or swap-target) halves the needed iterations
+    qindex numIts = qureg.numAmpsPerNode / powerOf2(qubits.size());
+
+    auto sortedQubits   = util_getSorted(qubits);
+    auto qubitStateMask = util_getBitMask(qubits, qubitStates);
+
+    // use template param to compile-time unroll loop in insertBits()
+    SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.size());
+
+    #pragma omp parallel for if(qureg.isMultithreaded)
+    for (qindex n=0; n<numIts; n++) {
+
+        // i = nth local index where qubits are in specified states (same indices that were packed)
+        qindex i = insertBitsWithMaskedValues(n, sortedQubits.data(), numBits, qubitStateMask);
+
+        // overwrite the strided local amplitudes with the received contiguous block
+        amps[i] = buffer[recvOffset + n];
+    }
+}
+
+
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_statevec_packAmpsForFusedSwap,   (Qureg, ConstList64, ConstList64, qcomp*, qindex) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_statevec_unpackAmpsForFusedSwap, (Qureg, ConstList64, ConstList64, qindex) )
+
+
+
+/*
  * SWAPS
  */
 
diff --git a/quest/src/cpu/cpu_subroutines.hpp b/quest/src/cpu/cpu_subroutines.hpp
index 3dbae057b..1c18996db 100644
--- a/quest/src/cpu/cpu_subroutines.hpp
+++ b/quest/src/cpu/cpu_subroutines.hpp
@@ -48,6 +48,9 @@ template <int NumQubits> qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, Con
 
 qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2);
 
+template <int NumQubits> void cpu_statevec_packAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qcomp* sendBuf, qindex sendOffset);
+template <int NumQubits> void cpu_statevec_unpackAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex recvOffset);
+
 
 /*
  * SWAPS
diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp
index 9b8e819b5..0db7aa275 100644
--- a/quest/src/gpu/gpu_subroutines.cpp
+++ b/quest/src/gpu/gpu_subroutines.cpp
@@ -192,6 +192,71 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qindex, gpu_statevec_packAmpsIntoBuffe
 
 
 
+/*
+ * FUSED SWAP BUFFER PACKING
+ *
+ * which reuse the existing packAmpsIntoBuffer and anyCtrlSwap_subB kernels (both
+ * already parameterised by an arbitrary buffer pointer offset), but direct them at
+ * an explicit staging buffer / receive offset to enable a single all-to-all round.
+ */
+
+
+template <int NumQubits>
+void gpu_statevec_packAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qcomp* sendBuf, qindex sendOffset) {
+
+    assert_numQubitsMatchesQubitStatesAndTemplateParam(qubits.size(), qubitStates.size(), NumQubits);
+
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
+
+    qindex numThreads = qureg.numAmpsPerNode / powerOf2(qubits.size());
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
+
+    devints sortedQubits = getDevInts(util_getSorted(qubits));
+    qindex qubitStateMask = util_getBitMask(qubits, qubitStates);
+
+    kernel_statevec_packAmpsIntoBuffer <NumQubits> <<<numBlocks, numThreadsPerBlock>>> (
+        getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(sendBuf) + sendOffset, numThreads,
+        getPtr(sortedQubits), qubits.size(), qubitStateMask
+    );
+
+#else
+    error_gpuSimButGpuNotCompiled();
+#endif
+}
+
+
+template <int NumQubits>
+void gpu_statevec_unpackAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex recvOffset) {
+
+    assert_numQubitsMatchesQubitStatesAndTemplateParam(qubits.size(), qubitStates.size(), NumQubits);
+
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
+
+    qindex numThreads = qureg.numAmpsPerNode / powerOf2(qubits.size());
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
+
+    devints sortedQubits = getDevInts(util_getSorted(qubits));
+    qindex qubitStateMask = util_getBitMask(qubits, qubitStates);
+
+    // reuse subB kernel which performs amps[i] = buffer[n]; caller offsets the buffer
+    kernel_statevec_anyCtrlSwap_subB <NumQubits> <<<numBlocks, numThreadsPerBlock>>> (
+        getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvOffset, numThreads,
+        getPtr(sortedQubits), qubits.size(), qubitStateMask
+    );
+
+#else
+    error_gpuSimButGpuNotCompiled();
+#endif
+}
+
+
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, gpu_statevec_packAmpsForFusedSwap,   (Qureg, ConstList64, ConstList64, qcomp*, qindex) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, gpu_statevec_unpackAmpsForFusedSwap, (Qureg, ConstList64, ConstList64, qindex) )
+
+
+
 /*
  * SWAPS
  */
diff --git a/quest/src/gpu/gpu_subroutines.hpp b/quest/src/gpu/gpu_subroutines.hpp
index 029e0e871..8e3d88603 100644
--- a/quest/src/gpu/gpu_subroutines.hpp
+++ b/quest/src/gpu/gpu_subroutines.hpp
@@ -43,6 +43,9 @@ template <int NumQubits> qindex gpu_statevec_packAmpsIntoBuffer(Qureg qureg, Con
 
 qindex gpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2);
 
+template <int NumQubits> void gpu_statevec_packAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qcomp* sendBuf, qindex sendOffset);
+template <int NumQubits> void gpu_statevec_unpackAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex recvOffset);
+
 
 /*
  * SWAPS