diff --git a/quest/src/core/accelerator.cpp b/quest/src/core/accelerator.cpp
index 677e6c74a..7752b8557 100644
--- a/quest/src/core/accelerator.cpp
+++ b/quest/src/core/accelerator.cpp
@@ -232,12 +232,27 @@ qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstL
 
     // note qubits may incidentally be ctrls or targs; it doesn't matter
     GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_packAmpsIntoBuffer, qureg, qubits.size() );
-    
+
     // return the number of packed amps, for caller convenience
     return func(qureg, qubits, qubitStates);
 }
 
 
+void accel_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates) {
+
+    // inverse of accel_statevec_packAmpsIntoBuffer(); scatters the received sub-buffer into
+    // the strided local amps where the given qubits are in the given states (used by the
+    // fused multi-SWAP routine). Only the CPU function is dispatched, without consulting
+    // qureg.isGpuAccelerated, so callers must pass only non-GPU Quregs (as the fused routine
+    // does). Issue #595 notes the OpenMP logic alone suffices, so no GPU kernel is provided
+    if (qubitStates.empty())
+        error_noCtrlsGivenToBufferPacker();
+
+    GET_FUNC_OPTIMISED_FOR_ONE_PARAM( func, cpu_statevec_unpackAmpsFromBuffer, qubits.size() );
+    func(qureg, qubits, qubitStates);
+}
+
+
 qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2) {
 
     return (qureg.isGpuAccelerated)?
diff --git a/quest/src/core/accelerator.hpp b/quest/src/core/accelerator.hpp
index 5a8dc37fb..879ad8d78 100644
--- a/quest/src/core/accelerator.hpp
+++ b/quest/src/core/accelerator.hpp
@@ -171,6 +171,8 @@ void accel_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, PauliS
 
 qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates);
 
+void accel_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates);
+
 qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2);
 
 
diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp
index 83a23b921..ab11f6215 100644
--- a/quest/src/core/localiser.cpp
+++ b/quest/src/core/localiser.cpp
@@ -900,24 +900,75 @@ void anyCtrlMultiSwapBetweenPrefixAndSuffix(Qureg qureg, ConstList64 ctrls, Cons
     // the SWAPs act on unique qubit pairs and so commute.
 
     /// @todo
-    ///   - the sequence of pair-wise full-swaps should be more efficient as a
-    ///     "single" sequence of smaller messages sending amps directly to their
-    ///     final destination node. This could use a new "multiSwap" function.
-    ///   - if the user has compiled cuQuantum, and Qureg is GPU-accelerated, the
-    ///     multiSwap function should use custatevecSwapIndexBits() if local,
-    ///     or custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed,
+    ///   - if the user has compiled cuQuantum, and Qureg is GPU-accelerated, this
+    ///     routine could use custatevecSwapIndexBits() if local, or
+    ///     custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed,
     ///     although the latter requires substantially more work like setting up
     ///     a communicator which may be inelegant alongside our own distribution scheme.
 
-    // perform necessary swaps to move all targets into suffix, each of which invokes communication
+    // collect the non-trivial pairs; each swaps a suffix qubit with a prefix qubit
+    auto suffixTargs = lists_getEmptyList64();
+    auto prefixTargs = lists_getEmptyList64();
     for (size_t i=0; i<targsA.size(); i++) {
-
         if (targsA[i] == targsB[i])
             continue;
+        suffixTargs.push_back(std::min(targsA[i], targsB[i]));
+        prefixTargs.push_back(std::max(targsA[i], targsB[i]));
+    }
+    int numSwaps = suffixTargs.size();
+    if (numSwaps == 0)
+        return;
+
+    // the fused routine below targets the uncontrolled, non-GPU case which every internal
+    // caller currently uses. A controlled multi-SWAP, or a GPU-accelerated Qureg, falls back
+    // to the per-swap routine (issue #595 notes the OpenMP logic alone is sufficient, so the
+    // GPU path is left unchanged)
+    if (!ctrls.empty() || qureg.isGpuAccelerated) {
+        for (int i=0; i<numSwaps; i++)
+            anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, suffixTargs[i], prefixTargs[i]);
+        return;
+    }
+
+    // FUSED multi-SWAP: rather than performing each prefix<->suffix SWAP in turn (which
+    // wastefully relays an amplitude through intermediate nodes before its final node),
+    // we send each amplitude directly to its destination node in a single pass. The
+    // numSwaps disjoint SWAPs compose into one permutation of qubit bits, so an amplitude
+    // of this node moves to the rank obtained by overwriting each prefix-target rank-bit
+    // with the value of its partnered suffix-target bit. We enumerate the (up to)
+    // 2^numSwaps - 1 destination nodes (one per non-empty subset of prefix targets whose
+    // partnered suffix bit disagrees with this node's rank bit) and, for each, pack +
+    // exchange + unpack only the amplitudes bound there. The move is an involution
+    // between paired nodes, so the packed and unpacked amplitudes occupy the same local
+    // slots. See arXiv:quant-ph/0608239 (SWAP fusion) and arXiv:2311.01512 Sec IV.
+
+    std::vector<int> prefBits(numSwaps);
+    std::vector<int> rankBits(numSwaps);
+    for (int i=0; i<numSwaps; i++) {
+        prefBits[i] = util_getPrefixInd(prefixTargs[i], qureg);
+        rankBits[i] = getBit(qureg.rank, prefBits[i]);
+    }
+
+    // subset 0 are the amplitudes that do not move (all suffix bits already match the
+    // rank bits), so we skip it and iterate only the communicating subsets
+    qindex numSubsets = powerOf2(numSwaps);
+    for (qindex sub=1; sub<numSubsets; sub++) {
+
+        // the destination node flips this node's rank bits for the targeted subset, and
+        // the to-be-sent amplitudes are those whose suffix-target bits match the pattern
+        auto states = lists_getEmptyList64();
+        int pairRank = qureg.rank;
+        for (int i=0; i<numSwaps; i++) {
+            int inSubset = getBit(sub, i);
+            states.push_back(inSubset ? !rankBits[i] : rankBits[i]);
+            if (inSubset)
+                pairRank = static_cast<int>(flipBit(pairRank, prefBits[i]));
+        }
 
-        int suffixTarg = std::min(targsA[i], targsB[i]);
-        int prefixTarg = std::max(targsA[i], targsB[i]);
-        anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, suffixTarg, prefixTarg);
+        // pack the amplitudes bound for pairRank, exchange, and scatter the received
+        // amplitudes back into those same local slots
+        qindex numPacked = accel_statevec_packAmpsIntoBuffer(qureg, suffixTargs, states);
+        comm_exchangeSubBuffers(qureg, numPacked, pairRank);
+        accel_statevec_unpackAmpsFromBuffer(qureg, suffixTargs, states);
     }
 }
 
diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp
index 59df946e9..524e6d89d 100644
--- a/quest/src/cpu/cpu_subroutines.cpp
+++ b/quest/src/cpu/cpu_subroutines.cpp
@@ -284,8 +284,49 @@ qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu
 INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qindex, cpu_statevec_packAmpsIntoBuffer, (Qureg, ConstList64, ConstList64) )
 
 
+template <int NumQubits>
+void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates) {
+
+    assert_numQubitsMatchesQubitStatesAndTemplateParam(qubitInds.size(), qubitStates.size(), NumQubits);
+
+    // inverse of cpu_statevec_packAmpsIntoBuffer(): copies the received contiguous
+    // sub-buffer into the strided local amplitudes where the given qubits are in the
+    // given states, overwriting them. It generalises anyCtrlSwap_subC to multiple
+    // constrained qubits, as needed by the fused multi-SWAP routine.
+
+    // use cpu_qcomp (in lieu of qcomp) purely for consistency; no arithmetic happens below
+    cpu_qcomp* amps   = getCpuQcompPtr(qureg.cpuAmps);
+    cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer);
 
-/* 
+    // each constrained qubit halves the number of received amps to be unpacked
+    qindex numIts = qureg.numAmpsPerNode / powerOf2(qubitInds.size());
+
+    // index within the communication buffer at which the received amplitudes begin
+    qindex offset = getBufferRecvInd();
+
+    auto sortedQubitInds = util_getSorted(qubitInds);
+    auto qubitStateMask  = util_getBitMask(qubitInds, qubitStates);
+
+    // use template param to compile-time unroll the loop in insertBitsWithMaskedValues()
+    SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubitInds.size());
+
+    #pragma omp parallel for if(qureg.isMultithreaded)
+    for (qindex n=0; n<numIts; n++) {
+
+        // i = nth local index where the qubits are in the specified states
+        qindex i = insertBitsWithMaskedValues(n, sortedQubitInds.data(), numBits, qubitStateMask);
+
+        // copy the nth received amp into its (potentially strided) local slot
+        amps[i] = buffer[offset + n];
+    }
+}
+
+
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_statevec_unpackAmpsFromBuffer, (Qureg, ConstList64, ConstList64) )
+
+
+
+/*
  * SWAPS
  */
 
diff --git a/quest/src/cpu/cpu_subroutines.hpp b/quest/src/cpu/cpu_subroutines.hpp
index 3dbae057b..5925ad5a4 100644
--- a/quest/src/cpu/cpu_subroutines.hpp
+++ b/quest/src/cpu/cpu_subroutines.hpp
@@ -46,6 +46,8 @@ void cpu_fullstatediagmatr_setElemsFromMultiVarFunc(FullStateDiagMatr out, qcomp
 
 template <int NumQubits> qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates);
 
+template <int NumQubits> void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates);
+
 qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2);