From fc98281184a5eea9c583e38b9ca34681335b836f Mon Sep 17 00:00:00 2001 From: zkasuran Date: Mon, 8 Jun 2026 08:36:51 +0530 Subject: [PATCH] Fuse distributed prefix-suffix multi-SWAP (closes #595) The localiser performed each prefix<->suffix SWAP in turn, so an amplitude moved by one SWAP was often moved again by the next, crossing the network several times. This fuses the group of disjoint SWAPs into one operation that computes each amplitude's final node and sends it there directly, so every amplitude crosses the network at most once. The disjoint SWAPs commute and compose into a single bit permutation. For the uncontrolled case (every internal caller) the routine enumerates the at most 2^eta - 1 destination nodes, where eta is the number of fused SWAPs, and packs, exchanges and unpacks only the amplitudes bound for each. A new cpu_statevec_unpackAmpsFromBuffer scatters the received sub-buffer back into the strided local amplitudes; it is the inverse of the existing packer and loops only over the moved amplitudes rather than the whole state. Scope is CPU/OpenMP. GPU quregs and controlled multi-SWAPs keep the existing per-SWAP path, so the GPU build is unchanged. Comm volume drops 25% at eta=2 and 42% at eta=3, matching theory: the fused routine sends a fraction 1 - 1/2^eta of each node's amplitudes, versus eta/2 for eta sequential SWAPs. Existing applySwap, applyCompMatr, applyCompMatr2 and calcPartialTrace suites pass at 1, 2, 4 and 8 ranks. 
--- quest/src/core/accelerator.cpp | 17 ++++++- quest/src/core/accelerator.hpp | 2 + quest/src/core/localiser.cpp | 73 ++++++++++++++++++++++++++----- quest/src/cpu/cpu_subroutines.cpp | 43 +++++++++++++++++- quest/src/cpu/cpu_subroutines.hpp | 2 + 5 files changed, 124 insertions(+), 13 deletions(-) diff --git a/quest/src/core/accelerator.cpp b/quest/src/core/accelerator.cpp index 677e6c74a..7752b8557 100644 --- a/quest/src/core/accelerator.cpp +++ b/quest/src/core/accelerator.cpp @@ -232,12 +232,27 @@ qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstL // note qubits may incidentally be ctrls or targs; it doesn't matter GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_packAmpsIntoBuffer, qureg, qubits.size() ); - + // return the number of packed amps, for caller convenience return func(qureg, qubits, qubitStates); } +void accel_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates) { + + // inverse of packing; scatters received sub-buffer into strided local amps where + // the given qubits are in the given states (used by the fused multi-SWAP routine). + // only the CPU path is dispatched; the fused routine restricts itself to non-GPU + // quregs (issue #595 notes the OpenMP logic alone is sufficient), so no GPU kernel + // is needed and the GPU build is left untouched + if (qubitStates.empty()) + error_noCtrlsGivenToBufferPacker(); + + GET_FUNC_OPTIMISED_FOR_ONE_PARAM( func, cpu_statevec_unpackAmpsFromBuffer, qubits.size() ); + func(qureg, qubits, qubitStates); +} + + qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2) { return (qureg.isGpuAccelerated)? 
diff --git a/quest/src/core/accelerator.hpp b/quest/src/core/accelerator.hpp index 5a8dc37fb..879ad8d78 100644 --- a/quest/src/core/accelerator.hpp +++ b/quest/src/core/accelerator.hpp @@ -171,6 +171,8 @@ void accel_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, PauliS qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates); +void accel_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates); + qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2); diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp index 83a23b921..ab11f6215 100644 --- a/quest/src/core/localiser.cpp +++ b/quest/src/core/localiser.cpp @@ -900,24 +900,75 @@ void anyCtrlMultiSwapBetweenPrefixAndSuffix(Qureg qureg, ConstList64 ctrls, Cons // the SWAPs act on unique qubit pairs and so commute. /// @todo - /// - the sequence of pair-wise full-swaps should be more efficient as a - /// "single" sequence of smaller messages sending amps directly to their - /// final destination node. This could use a new "multiSwap" function. - /// - if the user has compiled cuQuantum, and Qureg is GPU-accelerated, the - /// multiSwap function should use custatevecSwapIndexBits() if local, - /// or custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed, + /// - if the user has compiled cuQuantum, and Qureg is GPU-accelerated, this + /// routine could use custatevecSwapIndexBits() if local, or + /// custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed, /// although the latter requires substantially more work like setting up /// a communicator which may be inelegant alongside our own distribution scheme. 
- // perform necessary swaps to move all targets into suffix, each of which invokes communication + // collect the non-trivial pairs; each swaps a suffix qubit with a prefix qubit + auto suffixTargs = lists_getEmptyList64(); + auto prefixTargs = lists_getEmptyList64(); for (size_t i=0; isuffix SWAP in turn (which + // wastefully relays an amplitude through intermediate nodes before its final node), + // we send each amplitude directly to its destination node in a single pass. The + // numSwaps disjoint SWAPs compose into one permutation of qubit bits, so an amplitude + // of this node moves to the rank obtained by overwriting each prefix-target rank-bit + // with the value of its partnered suffix-target bit. We enumerate the (up to) + // 2^numSwaps - 1 destination nodes (one per non-empty subset of prefix targets whose + // partnered suffix bit disagrees with this node's rank bit) and, for each, pack + + // exchange + unpack only the amplitudes bound there. The move is an involution + // between paired nodes, so the packed and unpacked amplitudes occupy the same local + // slots. See arXiv:quant-ph/0608239 (SWAP fusion) and arXiv:2311.01512 Sec IV. 
+ + std::vector prefBits(numSwaps); + std::vector rankBits(numSwaps); + for (int i=0; i(flipBit(pairRank, prefBits[i])); + } - int suffixTarg = std::min(targsA[i], targsB[i]); - int prefixTarg = std::max(targsA[i], targsB[i]); - anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, suffixTarg, prefixTarg); + // pack the amplitudes bound for pairRank, exchange, and scatter the received + // amplitudes back into those same local slots + qindex numPacked = accel_statevec_packAmpsIntoBuffer(qureg, suffixTargs, states); + comm_exchangeSubBuffers(qureg, numPacked, pairRank); + accel_statevec_unpackAmpsFromBuffer(qureg, suffixTargs, states); } } diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp index 59df946e9..524e6d89d 100644 --- a/quest/src/cpu/cpu_subroutines.cpp +++ b/quest/src/cpu/cpu_subroutines.cpp @@ -284,8 +284,49 @@ qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qindex, cpu_statevec_packAmpsIntoBuffer, (Qureg, ConstList64, ConstList64) ) +template +void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates) { + + assert_numQubitsMatchesQubitStatesAndTemplateParam(qubitInds.size(), qubitStates.size(), NumQubits); + + // this is the inverse of cpu_statevec_packAmpsIntoBuffer; it scatters the received + // contiguous sub-buffer back into the strided local amplitudes where the given qubits + // are in the given states. It generalises anyCtrlSwap_subC to multiple constrained + // qubits, as needed by the fused multi-SWAP routine. + + // use cpu_qcomp (in lieu of qcomp) even though no arithmetic happens below - just for consistency! 
+ cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer); -/* + // each constrained qubit halves the number of received amps + qindex numIts = qureg.numAmpsPerNode / powerOf2(qubitInds.size()); + + // received amplitudes begin at the buffer's receive offset + qindex offset = getBufferRecvInd(); + + auto sortedQubitInds = util_getSorted(qubitInds); + auto qubitStateMask = util_getBitMask(qubitInds, qubitStates); + + // use template param to compile-time unroll loop in insertBits() + SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubitInds.size()); + + #pragma omp parallel for if(qureg.isMultithreaded) + for (qindex n=0; n qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates); +template void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates); + qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2);