diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp index c59334b55..be80d437a 100644 --- a/quest/src/api/environment.cpp +++ b/quest/src/api/environment.cpp @@ -17,6 +17,7 @@ #include "quest/src/core/autodeployer.hpp" #include "quest/src/core/validation.hpp" #include "quest/src/core/randomiser.hpp" +#include "quest/src/core/accelerator.hpp" #include "quest/src/comm/comm_config.hpp" #include "quest/src/cpu/cpu_config.hpp" #include "quest/src/gpu/gpu_config.hpp" @@ -459,6 +460,9 @@ void finalizeQuESTEnv() { // calling this will not automatically // free the memory of existing Quregs + // free the persistent fused-multi-swap staging workspace (host or device), if any + accel_clearFusedSwapSendCache(); + if (global_envPtr->isGpuAccelerated) gpu_clearCache(); // syncs first diff --git a/quest/src/comm/comm_routines.cpp b/quest/src/comm/comm_routines.cpp index cf6956454..31a317118 100644 --- a/quest/src/comm/comm_routines.cpp +++ b/quest/src/comm/comm_routines.cpp @@ -242,6 +242,45 @@ void exchangeArrays(qcomp* send, qcomp* recv, qindex numElems, int pairRank) { } +void exchangeArraysWithMultiplePartners( + qcomp* send, qcomp* recv, + const vector& partnerRanks, + const vector& sendInds, const vector& recvInds, + qindex numAmpsPerBlock +) { +#if QUEST_COMPILE_MPI + + MPI_Comm mpiComm = comm_getMpiComm(); + + // each per-partner block is divided into power-of-2 messages; we issue ALL of every + // partner's asynchronous send/receives up-front (a personalised all-to-all) and wait + // once, so that a fused multi-swap completes in a single communication round + auto [messageSize, numMessages] = dividePow2PayloadIntoMessages(numAmpsPerBlock); + + vector requests(2 * partnerRanks.size() * numMessages, MPI_REQUEST_NULL); + size_t r = 0; + + for (size_t b=0; b(m); + MPI_Irecv(&recv[recvInds[b] + m*messageSize], messageSize, MPI_QCOMP, pairRank, tag, mpiComm, &requests[r++]); + MPI_Isend(&send[sendInds[b] + m*messageSize], messageSize, MPI_QCOMP, pairRank, tag, mpiComm, &requests[r++]); + } + } + + // single wait completes the whole round (MPI will automatically free the request memory) + MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE); + +#else + error_commButEnvNotDistributed(); +#endif +} + + /* * PRIVATE ASYNC SEND AND RECEIVE @@ -528,11 +567,50 @@ void comm_exchangeSubBuffers(Qureg qureg, qindex numAmps, int pairRank) { if (qureg.isGpuAccelerated) exchangeGpuSubBuffers(qureg, numAmps, pairRank); - else + else exchangeArrays(&qureg.cpuCommBuffer[sendInd], &qureg.cpuCommBuffer[recvInd], numAmps, pairRank); } +void comm_exchangeAmpsToBuffersForFusedSwap( + Qureg qureg, qcomp* sendBuf, + vector partnerRanks, vector blockSendInds, vector blockRecvInds, + qindex numAmpsPerBlock +) { + assert_commQuregIsDistributed(qureg); + for (int pairRank : partnerRanks) + assert_pairRankIsDistinct(qureg, pairRank); + + // 'sendBuf' is a contiguous staging buffer (in the qureg's memory space) holding one + // packed block per subcube partner; received blocks are written into the qureg's + // commBuffer. The total staged payload is < numAmpsPerNode (the self-block never moves). + qindex sendSpan = 0; + qindex recvSpan = 0; + for (size_t b=0; b partnerRanks, vector blockSendInds, vector blockRecvInds, qindex numAmpsPerBlock); + void comm_asynchSendSubBuffer(Qureg qureg, qindex numElems, int pairRank); void comm_receiveArrayToBuffer(Qureg qureg, qindex numElems, int pairRank); diff --git a/quest/src/core/accelerator.cpp b/quest/src/core/accelerator.cpp index 677e6c74a..b5503ab81 100644 --- a/quest/src/core/accelerator.cpp +++ b/quest/src/core/accelerator.cpp @@ -246,6 +246,71 @@ qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int } +// persistent, lazily-grown staging workspace reused across fused multi-swaps, so we +// avoid a large (de)allocation (host malloc or cudaMalloc) on every call. This mirrors +// QuEST's existing 'gpuCache' for the dense-matrix kernel, and the persistent workspaces +// of cuStateVec / mpiQulacs. It is freed by accel_clearFusedSwapSendCache(), invoked at +// environment teardown (finalizeQuESTEnv). +static qcomp* fusedSwapSendCache = nullptr; +static qindex fusedSwapSendCacheLen = 0; +static bool fusedSwapSendCacheIsGpu = false; + + +qcomp* accel_allocFusedSwapSendBuffer(Qureg qureg, qindex numAmps) { + + // discard a stale cache if the memory space (RAM vs VRAM) has changed + if (fusedSwapSendCache != nullptr && fusedSwapSendCacheIsGpu != qureg.isGpuAccelerated) + accel_clearFusedSwapSendCache(); + + // reuse the existing workspace when already large enough + if (numAmps <= fusedSwapSendCacheLen) + return fusedSwapSendCache; + + // otherwise grow it (freeing the old, smaller buffer first) + if (fusedSwapSendCache != nullptr) + (fusedSwapSendCacheIsGpu)? gpu_deallocArray(fusedSwapSendCache) : cpu_deallocArray(fusedSwapSendCache); + + fusedSwapSendCacheIsGpu = qureg.isGpuAccelerated; + fusedSwapSendCache = (fusedSwapSendCacheIsGpu)? gpu_allocArray(numAmps) : cpu_allocArray(numAmps); + fusedSwapSendCacheLen = numAmps; + return fusedSwapSendCache; +} + + +void accel_deallocFusedSwapSendBuffer(Qureg qureg, qcomp* buffer) { + + // no-op: the staging workspace persists and is reused by subsequent fused + // multi-swaps; it is released at environment teardown. Kept for call-site symmetry. + (void) qureg; + (void) buffer; +} + + +void accel_clearFusedSwapSendCache() { + + if (fusedSwapSendCache == nullptr) + return; + + (fusedSwapSendCacheIsGpu)? gpu_deallocArray(fusedSwapSendCache) : cpu_deallocArray(fusedSwapSendCache); + fusedSwapSendCache = nullptr; + fusedSwapSendCacheLen = 0; +} + + +void accel_statevec_packAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qcomp* sendBuf, qindex sendOffset) { + + GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_packAmpsForFusedSwap, qureg, qubits.size() ); + func(qureg, qubits, qubitStates, sendBuf, sendOffset); +} + + +void accel_statevec_unpackAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex recvOffset) { + + GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_unpackAmpsForFusedSwap, qureg, qubits.size() ); + func(qureg, qubits, qubitStates, recvOffset); +} + + /* * SWAPS diff --git a/quest/src/core/accelerator.hpp b/quest/src/core/accelerator.hpp index 5a8dc37fb..456592b42 100644 --- a/quest/src/core/accelerator.hpp +++ b/quest/src/core/accelerator.hpp @@ -173,6 +173,13 @@ qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstL qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2); +qcomp* accel_allocFusedSwapSendBuffer(Qureg qureg, qindex numAmps); +void accel_deallocFusedSwapSendBuffer(Qureg qureg, qcomp* buffer); +void accel_clearFusedSwapSendCache(); + +void accel_statevec_packAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qcomp* sendBuf, qindex sendOffset); +void accel_statevec_unpackAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex recvOffset); + /* * SWAPS diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp index 83a23b921..ccfc641f2 100644 --- a/quest/src/core/localiser.cpp +++ b/quest/src/core/localiser.cpp @@ -34,6 +34,7 @@ #include #include #include +#include using std::vector; using std::tuple; @@ -893,6 +894,107 @@ void localiser_statevec_anyCtrlSwap(Qureg qureg, ConstList64 ctrls, ConstList64 */ +void fusedMultiSwapBetweenPrefixAndSuffix(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 prefixTargs, ConstList64 suffixTargs) { + + // we fuse 'numPairs' (>= 2) disjoint prefix<->suffix SWAPs into a single communication + // round. The 2^numPairs ranks differing from ours only in the swapped prefix qubits' + // rank-bits form a subcube; identifying a rank by the 'numPairs'-bit address of those + // rank-bits, the multi-swap relocates a local amp with suffix-target bit-pattern 'v' on + // rank-address 'a' to rank-address 'v' with new pattern 'a'. Hence the block with v==a + // stays put, and for each of the 2^numPairs - 1 partners we swap the block whose suffix + // pattern equals the partner's address. These index sets are disjoint, so all partner + // exchanges proceed concurrently. See arXiv:2311.01512, arXiv:quant-ph/0608239. + + int numPairs = prefixTargs.size(); + + // rank-bit positions of the prefix (global) targets, and our rank's bit values there + vector rankBitInds(numPairs); + vector myAddrBits(numPairs); + for (int i=0; i partnerRanks(numPartners); + vector blockInds(numPartners); + + // stage all partners' send-blocks contiguously in a temporary buffer (< one node's state) + qcomp* sendBuf = accel_allocFusedSwapSendBuffer(qureg, numPartners * numAmpsPerBlock); + + // pack each partner's block, identified by subset 'sub' of flipped rank-bits + for (int sub=1; sub<=numPartners; sub++) { + + int partner = sub - 1; + blockInds[partner] = partner * numAmpsPerBlock; + + int pairRank = qureg.rank; + List64 packStates = ctrlStates; + + for (int i=0; i> i) & 1; + if (flip) + pairRank = flipBit(pairRank, rankBitInds[i]); + + // partner's block holds local amps whose suffix-target[i] == partner's address bit i + packStates.push_back(myAddrBits[i] ^ flip); + } + + partnerRanks[partner] = pairRank; + accel_statevec_packAmpsForFusedSwap(qureg, packQubits, packStates, sendBuf, blockInds[partner]); + } + + // single all-to-all round: send each block to its partner, receive theirs into commBuffer + comm_exchangeAmpsToBuffersForFusedSwap(qureg, sendBuf, partnerRanks, blockInds, blockInds, numAmpsPerBlock); + + // scatter each received block back into the same local indices it was packed from + for (int sub=1; sub<=numPartners; sub++) { + + int partner = sub - 1; + List64 packStates = ctrlStates; + for (int i=0; i> i) & 1)); + + accel_statevec_unpackAmpsForFusedSwap(qureg, packQubits, packStates, blockInds[partner]); + } + + accel_deallocFusedSwapSendBuffer(qureg, sendBuf); +} + + +void multiSwapSequentially(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 prefixTargs, ConstList64 suffixTargs) { + + // reference (pre-fusion) behaviour: perform each disjoint prefix<->suffix SWAP one-at-a-time, + // wastefully relocating amplitudes once per swap. Retained for correctness comparison and + // benchmarking against the fused single-round path; never the default at runtime. + for (size_t i=0; i +void cpu_statevec_packAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qcomp* sendBuf, qindex sendOffset) { + + assert_numQubitsMatchesQubitStatesAndTemplateParam(qubits.size(), qubitStates.size(), NumQubits); + + // use cpu_qcomp (in lieu of qcomp) even though no arithmetic happens below - just for consistency! + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* buffer = getCpuQcompPtr(sendBuf); + + // each constrained qubit (ctrl or swap-target) halves the needed iterations + qindex numIts = qureg.numAmpsPerNode / powerOf2(qubits.size()); + + auto sortedQubits = util_getSorted(qubits); + auto qubitStateMask = util_getBitMask(qubits, qubitStates); + + // use template param to compile-time unroll loop in insertBits() + SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.size()); + + #pragma omp parallel for if(qureg.isMultithreaded) + for (qindex n=0; n +void cpu_statevec_unpackAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex recvOffset) { + + assert_numQubitsMatchesQubitStatesAndTemplateParam(qubits.size(), qubitStates.size(), NumQubits); + + // use cpu_qcomp (in lieu of qcomp) even though no arithmetic happens below - just for consistency! + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer); + + // each constrained qubit (ctrl or swap-target) halves the needed iterations + qindex numIts = qureg.numAmpsPerNode / powerOf2(qubits.size()); + + auto sortedQubits = util_getSorted(qubits); + auto qubitStateMask = util_getBitMask(qubits, qubitStates); + + // use template param to compile-time unroll loop in insertBits() + SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.size()); + + #pragma omp parallel for if(qureg.isMultithreaded) + for (qindex n=0; n qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, Con qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2); +template void cpu_statevec_packAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qcomp* sendBuf, qindex sendOffset); +template void cpu_statevec_unpackAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex recvOffset); + /* * SWAPS diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp index 9b8e819b5..0db7aa275 100644 --- a/quest/src/gpu/gpu_subroutines.cpp +++ b/quest/src/gpu/gpu_subroutines.cpp @@ -192,6 +192,71 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qindex, gpu_statevec_packAmpsIntoBuffe +/* + * FUSED SWAP BUFFER PACKING + * + * which reuse the existing packAmpsIntoBuffer and anyCtrlSwap_subB kernels (both + * already parameterised by an arbitrary buffer pointer offset), but direct them at + * an explicit staging buffer / receive offset to enable a single all-to-all round. + */ + + +template +void gpu_statevec_packAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qcomp* sendBuf, qindex sendOffset) { + + assert_numQubitsMatchesQubitStatesAndTemplateParam(qubits.size(), qubitStates.size(), NumQubits); + +#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM + + qindex numThreads = qureg.numAmpsPerNode / powerOf2(qubits.size()); + int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); + + devints sortedQubits = getDevInts(util_getSorted(qubits)); + qindex qubitStateMask = util_getBitMask(qubits, qubitStates); + + kernel_statevec_packAmpsIntoBuffer <<>> ( + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(sendBuf) + sendOffset, numThreads, + getPtr(sortedQubits), qubits.size(), qubitStateMask + ); + +#else + error_gpuSimButGpuNotCompiled(); +#endif +} + + +template +void gpu_statevec_unpackAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex recvOffset) { + + assert_numQubitsMatchesQubitStatesAndTemplateParam(qubits.size(), qubitStates.size(), NumQubits); + +#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM + + qindex numThreads = qureg.numAmpsPerNode / powerOf2(qubits.size()); + int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); + + devints sortedQubits = getDevInts(util_getSorted(qubits)); + qindex qubitStateMask = util_getBitMask(qubits, qubitStates); + + // reuse subB kernel which performs amps[i] = buffer[n]; caller offsets the buffer + kernel_statevec_anyCtrlSwap_subB <<>> ( + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvOffset, numThreads, + getPtr(sortedQubits), qubits.size(), qubitStateMask + ); + +#else + error_gpuSimButGpuNotCompiled(); +#endif +} + + +INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, gpu_statevec_packAmpsForFusedSwap, (Qureg, ConstList64, ConstList64, qcomp*, qindex) ) +INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, gpu_statevec_unpackAmpsForFusedSwap, (Qureg, ConstList64, ConstList64, qindex) ) + + + /* * SWAPS */ diff --git a/quest/src/gpu/gpu_subroutines.hpp b/quest/src/gpu/gpu_subroutines.hpp index 029e0e871..8e3d88603 100644 --- a/quest/src/gpu/gpu_subroutines.hpp +++ b/quest/src/gpu/gpu_subroutines.hpp @@ -43,6 +43,9 @@ template qindex gpu_statevec_packAmpsIntoBuffer(Qureg qureg, Con qindex gpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2); +template void gpu_statevec_packAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qcomp* sendBuf, qindex sendOffset); +template void gpu_statevec_unpackAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex recvOffset); + /* * SWAPS