diff --git a/quest/src/core/accelerator.cpp b/quest/src/core/accelerator.cpp index 677e6c74a..7752b8557 100644 --- a/quest/src/core/accelerator.cpp +++ b/quest/src/core/accelerator.cpp @@ -232,12 +232,27 @@ qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstL // note qubits may incidentally be ctrls or targs; it doesn't matter GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_packAmpsIntoBuffer, qureg, qubits.size() ); - + // return the number of packed amps, for caller convenience return func(qureg, qubits, qubitStates); } +void accel_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates) { + + // inverse of packing; scatters received sub-buffer into strided local amps where + // the given qubits are in the given states (used by the fused multi-SWAP routine). + // only the CPU path is dispatched; the fused routine restricts itself to non-GPU + // quregs (issue #595 notes the OpenMP logic alone is sufficient), so no GPU kernel + // is needed and the GPU build is left untouched + if (qubitStates.empty()) + error_noCtrlsGivenToBufferPacker(); + + GET_FUNC_OPTIMISED_FOR_ONE_PARAM( func, cpu_statevec_unpackAmpsFromBuffer, qubits.size() ); + func(qureg, qubits, qubitStates); +} + + qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2) { return (qureg.isGpuAccelerated)? diff --git a/quest/src/core/accelerator.hpp b/quest/src/core/accelerator.hpp index 5a8dc37fb..879ad8d78 100644 --- a/quest/src/core/accelerator.hpp +++ b/quest/src/core/accelerator.hpp @@ -171,6 +171,8 @@ void accel_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, PauliS qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates); +void accel_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates); + qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2); diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp index 83a23b921..ab11f6215 100644 --- a/quest/src/core/localiser.cpp +++ b/quest/src/core/localiser.cpp @@ -900,24 +900,75 @@ void anyCtrlMultiSwapBetweenPrefixAndSuffix(Qureg qureg, ConstList64 ctrls, Cons // the SWAPs act on unique qubit pairs and so commute. /// @todo - /// - the sequence of pair-wise full-swaps should be more efficient as a - /// "single" sequence of smaller messages sending amps directly to their - /// final destination node. This could use a new "multiSwap" function. - /// - if the user has compiled cuQuantum, and Qureg is GPU-accelerated, the - /// multiSwap function should use custatevecSwapIndexBits() if local, - /// or custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed, + /// - if the user has compiled cuQuantum, and Qureg is GPU-accelerated, this + /// routine could use custatevecSwapIndexBits() if local, or + /// custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed, /// although the latter requires substantially more work like setting up /// a communicator which may be inelegant alongside our own distribution scheme. - // perform necessary swaps to move all targets into suffix, each of which invokes communication + // collect the non-trivial pairs; each swaps a suffix qubit with a prefix qubit + auto suffixTargs = lists_getEmptyList64(); + auto prefixTargs = lists_getEmptyList64(); for (size_t i=0; isuffix SWAP in turn (which + // wastefully relays an amplitude through intermediate nodes before its final node), + // we send each amplitude directly to its destination node in a single pass. The + // numSwaps disjoint SWAPs compose into one permutation of qubit bits, so an amplitude + // of this node moves to the rank obtained by overwriting each prefix-target rank-bit + // with the value of its partnered suffix-target bit. We enumerate the (up to) + // 2^numSwaps - 1 destination nodes (one per non-empty subset of prefix targets whose + // partnered suffix bit disagrees with this node's rank bit) and, for each, pack + + // exchange + unpack only the amplitudes bound there. The move is an involution + // between paired nodes, so the packed and unpacked amplitudes occupy the same local + // slots. See arXiv:quant-ph/0608239 (SWAP fusion) and arXiv:2311.01512 Sec IV. + + std::vector prefBits(numSwaps); + std::vector rankBits(numSwaps); + for (int i=0; i(flipBit(pairRank, prefBits[i])); + } - int suffixTarg = std::min(targsA[i], targsB[i]); - int prefixTarg = std::max(targsA[i], targsB[i]); - anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, suffixTarg, prefixTarg); + // pack the amplitudes bound for pairRank, exchange, and scatter the received + // amplitudes back into those same local slots + qindex numPacked = accel_statevec_packAmpsIntoBuffer(qureg, suffixTargs, states); + comm_exchangeSubBuffers(qureg, numPacked, pairRank); + accel_statevec_unpackAmpsFromBuffer(qureg, suffixTargs, states); } } diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp index 59df946e9..524e6d89d 100644 --- a/quest/src/cpu/cpu_subroutines.cpp +++ b/quest/src/cpu/cpu_subroutines.cpp @@ -284,8 +284,49 @@ qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qindex, cpu_statevec_packAmpsIntoBuffer, (Qureg, ConstList64, ConstList64) ) +template +void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates) { + + assert_numQubitsMatchesQubitStatesAndTemplateParam(qubitInds.size(), qubitStates.size(), NumQubits); + + // this is the inverse of cpu_statevec_packAmpsIntoBuffer; it scatters the received + // contiguous sub-buffer back into the strided local amplitudes where the given qubits + // are in the given states. It generalises anyCtrlSwap_subC to multiple constrained + // qubits, as needed by the fused multi-SWAP routine. + + // use cpu_qcomp (in lieu of qcomp) even though no arithmetic happens below - just for consistency! + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer); -/* + // each constrained qubit halves the number of received amps + qindex numIts = qureg.numAmpsPerNode / powerOf2(qubitInds.size()); + + // received amplitudes begin at the buffer's receive offset + qindex offset = getBufferRecvInd(); + + auto sortedQubitInds = util_getSorted(qubitInds); + auto qubitStateMask = util_getBitMask(qubitInds, qubitStates); + + // use template param to compile-time unroll loop in insertBits() + SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubitInds.size()); + + #pragma omp parallel for if(qureg.isMultithreaded) + for (qindex n=0; n qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates); +template void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates); + qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2);