diff --git a/quest/src/comm/comm_routines.cpp b/quest/src/comm/comm_routines.cpp index cf6956454..189b79807 100644 --- a/quest/src/comm/comm_routines.cpp +++ b/quest/src/comm/comm_routines.cpp @@ -18,10 +18,14 @@ #include "quest/src/core/errors.hpp" #include "quest/src/core/bitwise.hpp" #include "quest/src/cpu/cpu_config.hpp" +#include "quest/src/cpu/cpu_subroutines.hpp" #include "quest/src/gpu/gpu_config.hpp" #include "quest/src/comm/comm_config.hpp" #include "quest/src/comm/comm_indices.hpp" +#include +#include + #if QUEST_COMPILE_MPI #include extern MPI_Comm comm_getMpiComm(); // comm_config.cpp does not leak MPI_Comm @@ -827,3 +831,52 @@ vector comm_gatherStringsToRoot(char* localChars, int maxNumLocalChars) return {}; #endif } + +void comm_exchangeFusedMultiSwap(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, const std::map& swapMap) { + assert_commQuregIsDistributed(qureg); + +#if QUEST_COMPILE_MPI + int k = swapMap.size(); + if (k == 0) return; + + // GPU fallback: sync GPU amps to CPU, perform fused swap on CPU, sync back + if (qureg.isGpuAccelerated) + syncQuregFromGpu(qureg); + + qindex chunkSize = qureg.numAmpsPerNode >> k; + + std::vector prefixTargs; + for (auto const& [s, p] : swapMap) { + prefixTargs.push_back(p); + } + + int myPrefixBits = 0; + for (int i = 0; i < k; i++) { + if (util_getRankBitOfQubit(prefixTargs[i], qureg)) { + myPrefixBits |= (1 << i); + } + } + + qcomp* sendBuffer = qureg.cpuCommBuffer; + qcomp* recvBuffer = qureg.cpuCommBuffer + chunkSize; + + for (int s = 1; s < (1 << k); s++) { + int target_m = myPrefixBits ^ s; + int pairRank = qureg.rank; + for (int i = 0; i < k; i++) { + if ((s >> i) & 1) { + pairRank = flipBit(pairRank, util_getPrefixInd(prefixTargs[i], qureg)); + } + } + + cpu_statevec_packFusedMultiSwapBuffers(qureg, swapMap, target_m, sendBuffer); + exchangeArrays(sendBuffer, recvBuffer, chunkSize, pairRank); + cpu_statevec_unpackFusedMultiSwapBuffers(qureg, swapMap, target_m, recvBuffer); + } + + if (qureg.isGpuAccelerated) + syncQuregToGpu(qureg); +#else + error_commButEnvNotDistributed(); +#endif +} diff --git a/quest/src/comm/comm_routines.hpp b/quest/src/comm/comm_routines.hpp index e75e889f6..ad669ff60 100644 --- a/quest/src/comm/comm_routines.hpp +++ b/quest/src/comm/comm_routines.hpp @@ -2,34 +2,36 @@ * Signatures for communicating and exchanging amplitudes between compute * nodes, when running in distributed mode, using the C MPI standard. * Calling these functions when QUEST_COMPILE_MPI=0, or when the passed Quregs - * are not distributed, will throw a runtime internal error. - * + * are not distributed, will throw a runtime internal error. + * * @author Tyson Jones */ #ifndef COMM_ROUTINES_HPP #define COMM_ROUTINES_HPP -#include "quest/include/types.h" -#include "quest/include/qureg.h" #include "quest/include/matrices.h" +#include "quest/include/qureg.h" +#include "quest/include/types.h" +#include "quest/src/core/utilities.hpp" -#include +#include #include +#include using std::vector; - - /* * STATE EXCHANGE METHODS */ -void comm_exchangeAmpsToBuffers(Qureg qureg, qindex sendInd, qindex recvInd, qindex numAmps, int pairRank); +void comm_exchangeAmpsToBuffers(Qureg qureg, qindex sendInd, qindex recvInd, + qindex numAmps, int pairRank); void comm_exchangeAmpsToBuffers(Qureg qureg, int pairRank); -void comm_exchangeSubBuffers(Qureg qureg, qindex numAmpsAndRecvInd, int pairRank); +void comm_exchangeSubBuffers(Qureg qureg, qindex numAmpsAndRecvInd, + int pairRank); void comm_asynchSendSubBuffer(Qureg qureg, qindex numElems, int pairRank); @@ -39,46 +41,46 @@ void comm_combineAmpsIntoBuffer(Qureg receiver, Qureg sender); void comm_combineElemsIntoBuffer(Qureg receiver, FullStateDiagMatr sender); - +void comm_exchangeFusedMultiSwap(Qureg qureg, ConstList64 ctrls, + ConstList64 ctrlStates, + const std::map &swapMap); /* * MISC COMMUNICATION METHODS */ -void comm_broadcastAmp(int sendRank, qcomp* sendAmp); - -void comm_sendAmpsToRoot(int sendRank, qcomp* send, qcomp* recv, qindex numAmps); +void comm_broadcastAmp(int sendRank, qcomp *sendAmp); -void comm_broadcastIntsFromRoot(int* arr, qindex length); +void comm_sendAmpsToRoot(int sendRank, qcomp *send, qcomp *recv, + qindex numAmps); -void comm_broadcastUnsignedsFromRoot(unsigned* arr, qindex length); - -void comm_combineSubArrays(qcomp* recv, vector globalRecvInds, vector localSendInds, vector numAmpsPerRank); +void comm_broadcastIntsFromRoot(int *arr, qindex length); +void comm_broadcastUnsignedsFromRoot(unsigned *arr, qindex length); +void comm_combineSubArrays(qcomp *recv, vector globalRecvInds, + vector localSendInds, + vector numAmpsPerRank); /* * REDUCTION METHODS */ -void comm_reduceAmp(qcomp* localAmp); +void comm_reduceAmp(qcomp *localAmp); -void comm_reduceReal(qreal* localReal); +void comm_reduceReal(qreal *localReal); -void comm_reduceReals(qreal* localReals, qindex numLocalReals); +void comm_reduceReals(qreal *localReals, qindex numLocalReals); bool comm_isTrueOnAllNodes(bool val); bool comm_isTrueOnRootNode(bool val); - - /* * GATHER METHODS */ -vector comm_gatherStringsToRoot(char* localChars, int maxNumLocalChars); - - +vector comm_gatherStringsToRoot(char *localChars, + int maxNumLocalChars); #endif // COMM_ROUTINES_HPP \ No newline at end of file diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp index 83a23b921..b71e0c57d 100644 --- a/quest/src/core/localiser.cpp +++ b/quest/src/core/localiser.cpp @@ -2,1074 +2,1152 @@ * Internal functions which localize the data needed for simulation. * That is, they determine whether performing a simulation requires * Qureg amplitudes from other distributed nodes and if so, invoke - * the necessary communication, before finally calling the + * the necessary communication, before finally calling the * embarrassingly parallel subroutines in accelerator.cpp. This is * done agnostically of whether amplitudes of the Qureg are being * stored in RAM (CPU) or VRAM (GPU). The bespoke per-operator logic * herein is what makes QuEST v4 truly unique among simulators! - * + * * @author Tyson Jones */ -#include "quest/include/qureg.h" -#include "quest/include/paulis.h" -#include "quest/include/matrices.h" #include "quest/include/initialisations.h" +#include "quest/include/matrices.h" +#include "quest/include/paulis.h" +#include "quest/include/qureg.h" -#include "quest/src/core/errors.hpp" +#include "quest/src/comm/comm_config.hpp" +#include "quest/src/comm/comm_routines.hpp" +#include "quest/src/core/accelerator.hpp" #include "quest/src/core/bitwise.hpp" +#include "quest/src/core/errors.hpp" #include "quest/src/core/lists.hpp" -#include "quest/src/core/utilities.hpp" -#include "quest/src/core/paulilogic.hpp" #include "quest/src/core/localiser.hpp" -#include "quest/src/core/accelerator.hpp" -#include "quest/src/comm/comm_config.hpp" -#include "quest/src/comm/comm_routines.hpp" +#include "quest/src/core/paulilogic.hpp" +#include "quest/src/core/utilities.hpp" #include "quest/src/cpu/cpu_config.hpp" #include "quest/src/gpu/gpu_config.hpp" -#include +#include #include -#include #include -#include +#include #include +#include -using std::vector; using std::tuple; - - +using std::vector; /* * PRIVATE FUNCTIONS */ - bool doesGateRequireComm(Qureg qureg, ConstList64 targs) { - // non-distributed quregs never communicate (duh) - if (!qureg.isDistributed) - return false; + // non-distributed quregs never communicate (duh) + if (!qureg.isDistributed) + return false; - // communication necessary when any prefix qubit is targeted - return ! util_areAllQubitsInSuffix(targs, qureg); + // communication necessary when any prefix qubit is targeted + return !util_areAllQubitsInSuffix(targs, qureg); } bool doesGateRequireComm(Qureg qureg, int targ) { - return doesGateRequireComm(qureg, lists_getList64({targ})); + return doesGateRequireComm(qureg, lists_getList64({targ})); } - bool doesChannelRequireComm(Qureg qureg, ConstList64 ketQubits) { - if (!qureg.isDensityMatrix) - error_localiserPassedStateVecToChannelComCheck(); + if (!qureg.isDensityMatrix) + error_localiserPassedStateVecToChannelComCheck(); - // ket-qubits are gauranteed to be in the suffix (because we distributed >=1 column per node), - // so channels invoke communication if any corresponding bra-qubits are in prefix - auto braQubits = util_getBraQubits(ketQubits, qureg); - return doesGateRequireComm(qureg, braQubits); + // ket-qubits are gauranteed to be in the suffix (because we distributed >=1 + // column per node), so channels invoke communication if any corresponding + // bra-qubits are in prefix + auto braQubits = util_getBraQubits(ketQubits, qureg); + return doesGateRequireComm(qureg, braQubits); } bool doesChannelRequireComm(Qureg qureg, int ketQubit) { - return doesChannelRequireComm(qureg, lists_getList64({ketQubit})); + return doesChannelRequireComm(qureg, lists_getList64({ketQubit})); } +bool doAnyLocalStatesHaveQubitValues(Qureg qureg, ConstList64 qubits, + ConstList64 states) { -bool doAnyLocalStatesHaveQubitValues(Qureg qureg, ConstList64 qubits, ConstList64 states) { - - // this answers the generic question of "do any of the given qubits lie in the - // prefix substate with node-fixed values inconsistent with the given states?" + // this answers the generic question of "do any of the given qubits lie in the + // prefix substate with node-fixed values inconsistent with the given states?" - // non-distributed quregs always have amps satisfying ctrls - if (!qureg.isDistributed) - return true; + // non-distributed quregs always have amps satisfying ctrls + if (!qureg.isDistributed) + return true; - // check each ctrl qubit - for (size_t i=0; i getSuffixQubitsAndStates(Qureg qureg, ConstList64 qubits, + ConstList64 states) { -tuple getSuffixQubitsAndStates(Qureg qureg, ConstList64 qubits, ConstList64 states) { - - List64 suffixQubits = lists_getEmptyList64(); - List64 suffixStates = lists_getEmptyList64(); + List64 suffixQubits = lists_getEmptyList64(); + List64 suffixStates = lists_getEmptyList64(); - for (size_t i=0; i ctrlInds; - for (size_t i=0; i ctrlInds; + for (size_t i = 0; i < numCtrls; i++) + ctrlInds[outCtrls[i]] = i; - // check every target in arbitrary order, modifying outTargs and outCtrls as we go - for (size_t i=0; i=1 columns; it can fit! + if (densmatr.isDistributed) return spoof; -} - - -Qureg createSpoofedLocalStateVecFromDensMatr(Qureg densmatr, bool &memWasAlloc) { - assert_localiserGivenDensMatr(densmatr); - // this function spoofs a non-distributed statevector Qureg with the - // same number of qubits as the given densmatr. It uses densmatr's - // mutlithread and GPU status, and if they exist, re-uses densmatr's - // CPU and GPU communication buffers for its main memory. If densmatr - // has no communication buffers, temporary memory is created, which is - // acceptable since it is expected much smaller than densmatr's local - // memory (a factor densmatr.numColsPerNode). In that scenario, - // memWasAlloc is overwritten to be true. + // otherwise, we must create temporary memory for pure. Even if we need only + // create GPU memory, we create the matching CPU memory too for defensive + // design + memWasAlloc = true; + spoof.cpuAmps = cpu_allocArray(spoof.numAmps); + assert_localiserSuccessfullyAllocatedTempMemory(spoof.cpuAmps, false); // CPU - memWasAlloc = false; - Qureg spoof = getSpoofedLocalStateVecFromDistributedDensMatrBuffers(densmatr); + if (spoof.isGpuAccelerated) { + spoof.gpuAmps = gpu_allocArray(spoof.numAmps); + assert_localiserSuccessfullyAllocatedTempMemory(spoof.gpuAmps, true); // GPU + } - // if it exists, we repurpose densmatr's existing buffer space to store - // pure's statevector, since every node contains >=1 columns; it can fit! - if (densmatr.isDistributed) - return spoof; - - // otherwise, we must create temporary memory for pure. Even if we need only - // create GPU memory, we create the matching CPU memory too for defensive design - memWasAlloc = true; - spoof.cpuAmps = cpu_allocArray(spoof.numAmps); - assert_localiserSuccessfullyAllocatedTempMemory(spoof.cpuAmps, false); // CPU - - if (spoof.isGpuAccelerated) { - spoof.gpuAmps = gpu_allocArray(spoof.numAmps); - assert_localiserSuccessfullyAllocatedTempMemory(spoof.gpuAmps, true); // GPU - } - - return spoof; + return spoof; } - void freeSpoofedLocalStateVec(Qureg spoof, bool wasMemAlloc) { - if (!wasMemAlloc) - return; + if (!wasMemAlloc) + return; - cpu_deallocArray(spoof.cpuAmps); + cpu_deallocArray(spoof.cpuAmps); - if (spoof.isGpuAccelerated) - gpu_deallocArray(spoof.gpuAmps); + if (spoof.isGpuAccelerated) + gpu_deallocArray(spoof.gpuAmps); } - - /* * COMMUNICATION WRAPPERS */ +void exchangeAmpsToBuffersWhereQubitsAreInStates(Qureg qureg, int pairRank, + ConstList64 qubits, + ConstList64 states) { -void exchangeAmpsToBuffersWhereQubitsAreInStates(Qureg qureg, int pairRank, ConstList64 qubits, ConstList64 states) { - - // when there are no constraining qubits, all amps are exchanged; there is no need to pack the buffer. - // this is typically triggered when a communicating localiser function is given no control qubits - if (qubits.empty()) { - comm_exchangeAmpsToBuffers(qureg, pairRank); - return; - } + // when there are no constraining qubits, all amps are exchanged; there is no + // need to pack the buffer. this is typically triggered when a communicating + // localiser function is given no control qubits + if (qubits.empty()) { + comm_exchangeAmpsToBuffers(qureg, pairRank); + return; + } - // otherwise, we pack and exchange only to-be-communicated amps between sub-buffers - qindex numPacked = accel_statevec_packAmpsIntoBuffer(qureg, qubits, states); - comm_exchangeSubBuffers(qureg, numPacked, pairRank); + // otherwise, we pack and exchange only to-be-communicated amps between + // sub-buffers + qindex numPacked = accel_statevec_packAmpsIntoBuffer(qureg, qubits, states); + comm_exchangeSubBuffers(qureg, numPacked, pairRank); } - - /* * GETTERS */ - qcomp localiser_statevec_getAmp(Qureg qureg, qindex globalInd) { - // this custom routine exists, in lieu of merely invoking - // getAmps() below, because it avoids the superfluous rank - // enumeration and is therefore locally faster; there - // may be applications where this is desirable (maybe) - - // when qureg not distributed (although env may be), every - // node returns their identical local amp - if (!qureg.isDistributed) { - qcomp amp; - accel_statevec_getAmps_sub(&, qureg, globalInd, 1); - return amp; - } - - // otherwise one node contains the target amp - qcomp amp = 0; - int sender = util_getRankContainingIndex(qureg, globalInd); - if (sender == qureg.rank) { - qindex localInd = util_getLocalIndexOfGlobalIndex(qureg, globalInd); - accel_statevec_getAmps_sub(&, qureg, localInd, 1); - } + // this custom routine exists, in lieu of merely invoking + // getAmps() below, because it avoids the superfluous rank + // enumeration and is therefore locally faster; there + // may be applications where this is desirable (maybe) - // which it shares with all other nodes - comm_broadcastAmp(sender, &); + // when qureg not distributed (although env may be), every + // node returns their identical local amp + if (!qureg.isDistributed) { + qcomp amp; + accel_statevec_getAmps_sub(&, qureg, globalInd, 1); return amp; + } + + // otherwise one node contains the target amp + qcomp amp = 0; + int sender = util_getRankContainingIndex(qureg, globalInd); + if (sender == qureg.rank) { + qindex localInd = util_getLocalIndexOfGlobalIndex(qureg, globalInd); + accel_statevec_getAmps_sub(&, qureg, localInd, 1); + } + + // which it shares with all other nodes + comm_broadcastAmp(sender, &); + return amp; +} + +void localiser_statevec_getAmps(qcomp *outAmps, Qureg qureg, + qindex globalStartInd, qindex globalNumAmps) { + + // we do not assert state-vec, since the density matrix routine re-uses this + // function + + // when not distributed, all nodes merely perform direct local overwrite and + // finish + if (!qureg.isDistributed) { + accel_statevec_getAmps_sub(outAmps, qureg, globalStartInd, globalNumAmps); + return; + } + + // when distributed, each node will broadcast their overlap (which may be + // zero) with the global range + int myRank = comm_getRank(); + int numNodes = comm_getNumNodes(); + + // which they first overwrite into their local copies of out (may involve a + // GPU-to-CPU copy) + if (util_areAnyVectorElemsWithinNode(myRank, qureg.numAmpsPerNode, + globalStartInd, globalNumAmps)) { + auto localInds = util_getLocalIndRangeOfVectorElemsWithinNode( + myRank, qureg.numAmpsPerNode, globalStartInd, globalNumAmps); + accel_statevec_getAmps_sub(&outAmps[localInds.localDuplicStartInd], qureg, + localInds.localDistribStartInd, + localInds.numElems); + } + + // determine the overlap with each node (i.e. which amps, if any, they + // contribute) + vector globalRecvInds(numNodes); + vector localSendInds(numNodes); + vector numAmpsPerRank(numNodes, 0); // default = zero contributed amps + + for (int sendRank = 0; sendRank < numNodes; sendRank++) { + + if (!util_areAnyVectorElemsWithinNode(sendRank, qureg.numAmpsPerNode, + globalStartInd, globalNumAmps)) + continue; + + auto inds = util_getLocalIndRangeOfVectorElemsWithinNode( + sendRank, qureg.numAmpsPerNode, globalStartInd, globalNumAmps); + globalRecvInds[sendRank] = inds.localDuplicStartInd; + localSendInds[sendRank] = inds.localDistribStartInd; + numAmpsPerRank[sendRank] = inds.numElems; + } + + // contributor nodes broadcast, all nodes receive, so that every node + // populates 'outAmps' fully + comm_combineSubArrays(outAmps, globalRecvInds, localSendInds, numAmpsPerRank); +} + +void localiser_densmatr_getAmps(qcomp **outAmps, Qureg qureg, qindex startRow, + qindex startCol, qindex numRows, + qindex numCols) { + assert_localiserGivenDensMatr(qureg); + + /// @todo improve the performance! + /// this function simply serially invokes localiser_statevec_getAmps() upon + /// every indicated column, for simplicity, and since we believe this function + /// will only ever be called upon tractably small sub-matrices. After all, the + /// user is likely to serially process outAmps themselves. Our method incurs + /// the below insignificant performance penalties: + /// - we must allocate temporary memory that is the same size as outAmps, + /// but transposed, because we cannot make a pointer to a column of outAmps + /// in order to invoke localiser_statevec_getAmps() thereupon. Thereafter, + /// we serially populate outAmps with the transposed temp memory. + /// - every invocation of localiser_statevec_getAmps() invokes synchronous + /// GPU-CPU copying (a total of #numCols), whereas a bespoke implementation + /// could perform each non-contiguous copy asynchronously then wait + /// - every invocation invokes synchronous MPI broadcasting, whereas a bespoke + /// method could asynch all per-col broadcasts before a final wait + /// A custom function to remedy these issues is complicated; it would involve + /// e.g. exposing MPI_Request outside of comm_routines.cpp (unacceptable for + /// compiler compatibility), or having comm_routines cache un-fulfilled asynch + /// requests, etc. + + vector> tempOut; + util_tryAllocMatrix( + tempOut, numCols, numRows, + error_localiserFailedToAllocTempMemory); // transposed dim of outAmps + + for (qindex c = 0; c < numCols; c++) { + qindex flatInd = util_getGlobalFlatIndex(qureg, startRow, startCol + c); + localiser_statevec_getAmps(tempOut[c].data(), qureg, flatInd, numRows); + } + + // serially overwrite outAmps = transpose(tempOut) + for (qindex r = 0; r < numRows; r++) + for (qindex c = 0; c < numCols; c++) + outAmps[r][c] = tempOut[c][r]; // writes contiguous, reads strided +} + +void localiser_fullstatediagmatr_getElems(qcomp *outElems, + FullStateDiagMatr matr, + qindex globalStartInd, + qindex globalNumElems) { + + // re-use identical logic of statevector setter + Qureg qureg = getSpoofedBufferlessQuregFromFullStateDiagMatr(matr); + localiser_statevec_getAmps(outElems, qureg, globalStartInd, globalNumElems); } - -void localiser_statevec_getAmps(qcomp* outAmps, Qureg qureg, qindex globalStartInd, qindex globalNumAmps) { - - // we do not assert state-vec, since the density matrix routine re-uses this function - - // when not distributed, all nodes merely perform direct local overwrite and finish - if (!qureg.isDistributed) { - accel_statevec_getAmps_sub(outAmps, qureg, globalStartInd, globalNumAmps); - return; - } - - // when distributed, each node will broadcast their overlap (which may be zero) with the global range - int myRank = comm_getRank(); - int numNodes = comm_getNumNodes(); - - // which they first overwrite into their local copies of out (may involve a GPU-to-CPU copy) - if (util_areAnyVectorElemsWithinNode(myRank, qureg.numAmpsPerNode, globalStartInd, globalNumAmps)) { - auto localInds = util_getLocalIndRangeOfVectorElemsWithinNode(myRank, qureg.numAmpsPerNode, globalStartInd, globalNumAmps); - accel_statevec_getAmps_sub(&outAmps[localInds.localDuplicStartInd], qureg, localInds.localDistribStartInd, localInds.numElems); - } - - // determine the overlap with each node (i.e. which amps, if any, they contribute) - vector globalRecvInds(numNodes); - vector localSendInds (numNodes); - vector numAmpsPerRank(numNodes, 0); // default = zero contributed amps - - for (int sendRank=0; sendRank> tempOut; - util_tryAllocMatrix(tempOut, numCols, numRows, error_localiserFailedToAllocTempMemory); // transposed dim of outAmps - - for (qindex c=0; c> tempAmps; - util_tryAllocMatrix(tempAmps, numCols, numRows, error_localiserFailedToAllocTempMemory); // transpose of inAmps + vector> tempAmps; + util_tryAllocMatrix( + tempAmps, numCols, numRows, + error_localiserFailedToAllocTempMemory); // transpose of inAmps - // serially overwrite tempAmps = transpose(inAmps) - for (qindex c=0; c targ1 - if (targ1 > targ2) - std::swap(targ1, targ2); + // ensure targ2 > targ1 + if (targ1 > targ2) + std::swap(targ1, targ2); - // node has nothing to do if all local amps violate control condition - if (!doAnyLocalStatesHaveQubitValues(qureg, ctrls, ctrlStates)) - return; + // node has nothing to do if all local amps violate control condition + if (!doAnyLocalStatesHaveQubitValues(qureg, ctrls, ctrlStates)) + return; - // retain only suffix control qubits as relevant to communication and local amp modification - auto [suffixCtrls, suffixCtrlStates] = getSuffixQubitsAndStates(qureg, ctrls, ctrlStates); + // retain only suffix control qubits as relevant to communication and local + // amp modification + auto [suffixCtrls, suffixCtrlStates] = + getSuffixQubitsAndStates(qureg, ctrls, ctrlStates); - // determine necessary communication - bool comm1 = doesGateRequireComm(qureg, targ1); - bool comm2 = doesGateRequireComm(qureg, targ2); + // determine necessary communication + bool comm1 = doesGateRequireComm(qureg, targ1); + bool comm2 = doesGateRequireComm(qureg, targ2); - if (comm2 && comm1) - anyCtrlSwapBetweenPrefixAndPrefix(qureg, suffixCtrls, suffixCtrlStates, targ1, targ2); - if (comm2 && !comm1) - anyCtrlSwapBetweenPrefixAndSuffix(qureg, suffixCtrls, suffixCtrlStates, targ1, targ2); - if (!comm2 && !comm1) - accel_statevec_anyCtrlSwap_subA(qureg, suffixCtrls, suffixCtrlStates, targ1, targ2); + if (comm2 && comm1) + anyCtrlSwapBetweenPrefixAndPrefix(qureg, suffixCtrls, suffixCtrlStates, + targ1, targ2); + if (comm2 && !comm1) + anyCtrlSwapBetweenPrefixAndSuffix(qureg, suffixCtrls, suffixCtrlStates, + targ1, targ2); + if (!comm2 && !comm1) + accel_statevec_anyCtrlSwap_subA(qureg, suffixCtrls, suffixCtrlStates, targ1, + targ2); } - - /* * MULTI-SWAP */ - -void anyCtrlMultiSwapBetweenPrefixAndSuffix(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targsA, ConstList64 targsB) { - - // this is an internal function called by the below routines which require - // performing a sequence of SWAPs to reorder qubits, or move them into suffix. - // the SWAPs act on unique qubit pairs and so commute. - - /// @todo - /// - the sequence of pair-wise full-swaps should be more efficient as a - /// "single" sequence of smaller messages sending amps directly to their - /// final destination node. This could use a new "multiSwap" function. - /// - if the user has compiled cuQuantum, and Qureg is GPU-accelerated, the - /// multiSwap function should use custatevecSwapIndexBits() if local, - /// or custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed, - /// although the latter requires substantially more work like setting up - /// a communicator which may be inelegant alongside our own distribution scheme. - - // perform necessary swaps to move all targets into suffix, each of which invokes communication - for (size_t i=0; i swapMap; + for (size_t i = 0; i < targsA.size(); i++) { + if (targsA[i] != targsB[i]) { + swapMap[std::min(targsA[i], targsB[i])] = std::max(targsA[i], targsB[i]); } + } + if (swapMap.empty()) { + return; + } + comm_exchangeFusedMultiSwap(qureg, ctrls, ctrlStates, swapMap); } - - /* * ONE-TARGET DENSE MATRIX */ +void anyCtrlOneTargDenseMatrOnPrefix(Qureg qureg, ConstList64 ctrls, + ConstList64 ctrlStates, int targ, + CompMatr1 matr) { -void anyCtrlOneTargDenseMatrOnPrefix(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, CompMatr1 matr) { - - int pairRank = util_getRankWithQubitFlipped(targ, qureg); - exchangeAmpsToBuffersWhereQubitsAreInStates(qureg, pairRank, ctrls, ctrlStates); + int pairRank = util_getRankWithQubitFlipped(targ, qureg); + exchangeAmpsToBuffersWhereQubitsAreInStates(qureg, pairRank, ctrls, + ctrlStates); - // extract relevant gate elements - int bit = util_getRankBitOfQubit(targ, qureg); - qcomp fac0 = matr.elems[bit][ bit]; - qcomp fac1 = matr.elems[bit][!bit]; + // extract relevant gate elements + int bit = util_getRankBitOfQubit(targ, qureg); + qcomp fac0 = matr.elems[bit][bit]; + qcomp fac1 = matr.elems[bit][!bit]; - // update local amps using received amps in buffer - accel_statevec_anyCtrlOneTargDenseMatr_subB(qureg, ctrls, ctrlStates, fac0, fac1); + // update local amps using received amps in buffer + accel_statevec_anyCtrlOneTargDenseMatr_subB(qureg, ctrls, ctrlStates, fac0, + fac1); } +void localiser_statevec_anyCtrlOneTargDenseMatr(Qureg qureg, ConstList64 ctrls, + ConstList64 ctrlStates, + int targ, CompMatr1 matr, + bool conj, bool transp) { + assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size()); -void localiser_statevec_anyCtrlOneTargDenseMatr(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, CompMatr1 matr, bool conj, bool transp) { - assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size()); - - // node has nothing to do if all local amps violate control condition - if (!doAnyLocalStatesHaveQubitValues(qureg, ctrls, ctrlStates)) - return; + // node has nothing to do if all local amps violate control condition + if (!doAnyLocalStatesHaveQubitValues(qureg, ctrls, ctrlStates)) + return; - // retain only suffix control qubits as relevant to communication and local amp modification - auto [suffixCtrls, suffixCtrlStates] = getSuffixQubitsAndStates(qureg, ctrls, ctrlStates); + // retain only suffix control qubits as relevant to communication and local + // amp modification + auto [suffixCtrls, suffixCtrlStates] = + getSuffixQubitsAndStates(qureg, ctrls, ctrlStates); - // only one of conj or transp will be true (but logic is correct if both were true) - if (conj) - matr = util_getConj(matr); - if (transp) - matr = util_getTranspose(matr); + // only one of conj or transp will be true (but logic is correct if both were + // true) + if (conj) + matr = util_getConj(matr); + if (transp) + matr = util_getTranspose(matr); - // perform embarrassingly parallel routine or communication-inducing swaps - doesGateRequireComm(qureg, targ)? - anyCtrlOneTargDenseMatrOnPrefix(qureg, suffixCtrls, suffixCtrlStates, targ, matr) : - accel_statevec_anyCtrlOneTargDenseMatr_subA(qureg, suffixCtrls, suffixCtrlStates, targ, matr); + // perform embarrassingly parallel routine or communication-inducing swaps + doesGateRequireComm(qureg, targ) + ? anyCtrlOneTargDenseMatrOnPrefix(qureg, suffixCtrls, suffixCtrlStates, + targ, matr) + : accel_statevec_anyCtrlOneTargDenseMatr_subA( + qureg, suffixCtrls, suffixCtrlStates, targ, matr); } - - /* * TWO-TARGET & ANY-TARGET DENSE MATRIX * - * which are intermixed, despite each having their own local backend + * which are intermixed, despite each having their own local backend * implementations, because they use identical communication logic */ - -void anyCtrlTwoOrAnyTargDenseMatrOnSuffix(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, CompMatr2 matr, bool conj, bool transp) { - if (conj) - matr = util_getConj(matr); - if (transp) - matr = util_getTranspose(matr); - accel_statevec_anyCtrlTwoTargDenseMatr_sub(qureg, ctrls, ctrlStates, targs[0], targs[1], matr); +void anyCtrlTwoOrAnyTargDenseMatrOnSuffix(Qureg qureg, ConstList64 ctrls, + ConstList64 ctrlStates, + ConstList64 targs, CompMatr2 matr, + bool conj, bool transp) { + if (conj) + matr = util_getConj(matr); + if (transp) + matr = util_getTranspose(matr); + accel_statevec_anyCtrlTwoTargDenseMatr_sub(qureg, ctrls, ctrlStates, targs[0], + targs[1], matr); +} +void anyCtrlTwoOrAnyTargDenseMatrOnSuffix(Qureg qureg, ConstList64 ctrls, + ConstList64 ctrlStates, + ConstList64 targs, CompMatr matr, + bool conj, bool transp) { + accel_statevec_anyCtrlAnyTargDenseMatr_sub(qureg, ctrls, ctrlStates, targs, + matr, conj, transp); } -void anyCtrlTwoOrAnyTargDenseMatrOnSuffix(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, CompMatr matr, bool conj, bool transp) { - accel_statevec_anyCtrlAnyTargDenseMatr_sub(qureg, ctrls, ctrlStates, targs, matr, conj, transp); -} - // T can be CompMatr2 or CompMatr template -void anyCtrlTwoOrAnyTargDenseMatr(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, T matr, bool conj, bool transp) { - - // node has nothing to do if all local amps violate control condition - if (!doAnyLocalStatesHaveQubitValues(qureg, ctrls, ctrlStates)) - return; - - // skip straight to embarrasingly parallel simulation if possible - if (!doesGateRequireComm(qureg, targs)) { - - // using only the suffix ctrls - auto [suffixCtrls, suffixCtrlStates] = getSuffixQubitsAndStates(qureg, ctrls, ctrlStates); - anyCtrlTwoOrAnyTargDenseMatrOnSuffix(qureg, suffixCtrls, suffixCtrlStates, targs, matr, conj, transp); - return; - } - - // find suffix positions for all prefix targs, moving colliding ctrls out of the way - auto [newCtrls, newTargs] = getCtrlsAndTargsSwappedToMinSuffix(qureg, ctrls, targs); - - // only unmoved ctrls can be applied to the swaps, to accelerate them - auto [unmovedCtrls, unmovedCtrlStates] = getNonSwappedCtrlsAndStates(ctrls, ctrlStates, newCtrls); - - /// @todo - /// above, we track which control qubits are un-targeted by the SWAPs; such controls can be - /// seen as 'meta' to the entire operation, and so SHOULD be passable to the SWAPs below in - /// order to accelerate them (since more ctrls = fewer comm). However, this is strangely not - /// working; controlling the SWAPs upon these 'meta' control qubits is breaking the unit tests! - /// Until we better understand this, we disable this optimisation by removing all SWAP controls. - unmovedCtrls = lists_getEmptyList64(); - unmovedCtrlStates = lists_getEmptyList64(); - - // perform necessary swaps to move all targets into suffix, invoking communication (swaps are real, so no need to conj) - anyCtrlMultiSwapBetweenPrefixAndSuffix(qureg, unmovedCtrls, unmovedCtrlStates, targs, newTargs); - - // if the moved ctrls do not eliminate this node's need for local simulation... - if (doAnyLocalStatesHaveQubitValues(qureg, newCtrls, ctrlStates)) { - - // perform embarrassingly parallel simulation using only the new suffix ctrls - auto [newSuffixCtrls, suffixCtrlStates] = getSuffixQubitsAndStates(qureg, newCtrls, ctrlStates); - anyCtrlTwoOrAnyTargDenseMatrOnSuffix(qureg, newSuffixCtrls, suffixCtrlStates, newTargs, matr, conj, transp); - } - - // undo swaps, again invoking communication - anyCtrlMultiSwapBetweenPrefixAndSuffix(qureg, unmovedCtrls, unmovedCtrlStates, targs, newTargs); -} - - -void localiser_statevec_anyCtrlTwoTargDenseMatr(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2, CompMatr2 matr, bool conj, bool transp) { - assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size()); - - anyCtrlTwoOrAnyTargDenseMatr(qureg, ctrls, ctrlStates, lists_getList64({targ1,targ2}), matr, conj, transp); -} - - -void localiser_statevec_anyCtrlAnyTargDenseMatr(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, CompMatr matr, bool conj, bool transp) { - assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size()); - - // despite our use of compile-time templating, the bespoke one-targ routines are still faster - // than this any-targ routine when given a single target, because they can leverage a bespoke - // communication pattern (rather than swapping qubits into suffix), and pass the matrix elems - // to GPU kernels via arguments rather than global memory, which is faster for threads to read. - // Callers may however still choose this function (rather than the one-qubit specific one) for - // its convenient generality, so we divert to the one-targ routine when possible, copying the - // heap CPU matrix (assumed consistent with GPU memory) into stack memory - if (targs.size() == 1) - localiser_statevec_anyCtrlOneTargDenseMatr(qureg, ctrls, ctrlStates, targs[0], getCompMatr1(matr.cpuElems), conj, transp); - - // similarly, bespoke two-targ routines are preferable although they offer no communication - // benefit because they call the same any-targ localiser, but still accelerate GPU memory access. - // this function call is the same as below, but we explicitly pass a CompMatr2 type in lieu of - // CompMatr, which avoids having to copy the CompMatr dynamic memory into accelerator backends - else if (targs.size() == 2) - localiser_statevec_anyCtrlTwoTargDenseMatr(qureg, ctrls, ctrlStates, targs[0], targs[1], getCompMatr2(matr.cpuElems), conj, transp); - - // call the any-targ routine when given 3 or more targs, which may still invoke bespoke, - // fixed-targ instances of backend templated functions depending the number of targs - else - anyCtrlTwoOrAnyTargDenseMatr(qureg, ctrls, ctrlStates, targs, matr, conj, transp); +void anyCtrlTwoOrAnyTargDenseMatr(Qureg qureg, ConstList64 ctrls, + ConstList64 ctrlStates, ConstList64 targs, + T matr, bool conj, bool transp) { + + // node has nothing to do if all local amps violate control condition + if (!doAnyLocalStatesHaveQubitValues(qureg, ctrls, ctrlStates)) + return; + + // skip straight to embarrasingly parallel simulation if possible + if (!doesGateRequireComm(qureg, targs)) { + + // using only the suffix ctrls + auto [suffixCtrls, suffixCtrlStates] = + getSuffixQubitsAndStates(qureg, ctrls, ctrlStates); + anyCtrlTwoOrAnyTargDenseMatrOnSuffix(qureg, suffixCtrls, suffixCtrlStates, + targs, matr, conj, transp); + return; + } + + // find suffix positions for all prefix targs, moving colliding ctrls out of + // the way + auto [newCtrls, newTargs] = + getCtrlsAndTargsSwappedToMinSuffix(qureg, ctrls, targs); + + // only unmoved ctrls can be applied to the swaps, to accelerate them + auto [unmovedCtrls, unmovedCtrlStates] = + getNonSwappedCtrlsAndStates(ctrls, ctrlStates, newCtrls); + + /// @todo + /// above, we track which control qubits are un-targeted by the SWAPs; such + /// controls can be seen as 'meta' to the entire operation, and so SHOULD be + /// passable to the SWAPs below in order to accelerate them (since more ctrls + /// = fewer comm). However, this is strangely not working; controlling the + /// SWAPs upon these 'meta' control qubits is breaking the unit tests! Until + /// we better understand this, we disable this optimisation by removing all + /// SWAP controls. + unmovedCtrls = lists_getEmptyList64(); + unmovedCtrlStates = lists_getEmptyList64(); + + // perform necessary swaps to move all targets into suffix, invoking + // communication (swaps are real, so no need to conj) + anyCtrlMultiSwapBetweenPrefixAndSuffix(qureg, unmovedCtrls, unmovedCtrlStates, + targs, newTargs); + + // if the moved ctrls do not eliminate this node's need for local + // simulation... + if (doAnyLocalStatesHaveQubitValues(qureg, newCtrls, ctrlStates)) { + + // perform embarrassingly parallel simulation using only the new suffix + // ctrls + auto [newSuffixCtrls, suffixCtrlStates] = + getSuffixQubitsAndStates(qureg, newCtrls, ctrlStates); + anyCtrlTwoOrAnyTargDenseMatrOnSuffix( + qureg, newSuffixCtrls, suffixCtrlStates, newTargs, matr, conj, transp); + } + + // undo swaps, again invoking communication + anyCtrlMultiSwapBetweenPrefixAndSuffix(qureg, unmovedCtrls, unmovedCtrlStates, + targs, newTargs); +} + +void localiser_statevec_anyCtrlTwoTargDenseMatr(Qureg qureg, ConstList64 ctrls, + ConstList64 ctrlStates, + int targ1, int targ2, + CompMatr2 matr, bool conj, + bool transp) { + assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size()); + + anyCtrlTwoOrAnyTargDenseMatr(qureg, ctrls, ctrlStates, + lists_getList64({targ1, targ2}), matr, conj, + transp); +} + +void localiser_statevec_anyCtrlAnyTargDenseMatr(Qureg qureg, ConstList64 ctrls, + ConstList64 ctrlStates, + ConstList64 targs, + CompMatr matr, bool conj, + bool transp) { + assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size()); + + // despite our use of compile-time templating, the bespoke one-targ routines + // are still faster than this any-targ routine when given a single target, + // because they can leverage a bespoke communication pattern (rather than + // swapping qubits into suffix), and pass the matrix elems to GPU kernels via + // arguments rather than global memory, which is faster for threads to read. + // Callers may however still choose this function (rather than the one-qubit + // specific one) for its convenient generality, so we divert to the one-targ + // routine when possible, copying the heap CPU matrix (assumed consistent with + // GPU memory) into stack memory + if (targs.size() == 1) + localiser_statevec_anyCtrlOneTargDenseMatr( + qureg, ctrls, ctrlStates, targs[0], getCompMatr1(matr.cpuElems), conj, + transp); + + // similarly, bespoke two-targ routines are preferable although they offer no + // communication benefit because they call the same any-targ localiser, but + // still accelerate GPU memory access. this function call is the same as + // below, but we explicitly pass a CompMatr2 type in lieu of CompMatr, which + // avoids having to copy the CompMatr dynamic memory into accelerator backends + else if (targs.size() == 2) + localiser_statevec_anyCtrlTwoTargDenseMatr( + qureg, ctrls, ctrlStates, targs[0], targs[1], + getCompMatr2(matr.cpuElems), conj, transp); + + // call the any-targ routine when given 3 or more targs, which may still + // invoke bespoke, fixed-targ instances of backend templated functions + // depending the number of targs + else + anyCtrlTwoOrAnyTargDenseMatr(qureg, ctrls, ctrlStates, targs, matr, conj, + transp); } - - /* * ANY-TARGET DIAGONAL MATRIX * @@ -1078,1252 +1156,1344 @@ void localiser_statevec_anyCtrlAnyTargDenseMatr(Qureg qureg, ConstList64 ctrls, * because diagonals are always embarrassingly parallel */ +void localiser_statevec_anyCtrlOneTargDiagMatr(Qureg qureg, ConstList64 ctrls, + ConstList64 ctrlStates, int targ, + DiagMatr1 matr, bool conj) { + assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size()); -void localiser_statevec_anyCtrlOneTargDiagMatr(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, DiagMatr1 matr, bool conj) { - assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size()); - - // node has nothing to do if all local amps violate control condition - if (!doAnyLocalStatesHaveQubitValues(qureg, ctrls, ctrlStates)) - return; + // node has nothing to do if all local amps violate control condition + if (!doAnyLocalStatesHaveQubitValues(qureg, ctrls, ctrlStates)) + return; - if (conj) - matr = util_getConj(matr); + if (conj) + matr = util_getConj(matr); - // only suffix control qubits are relevant to local amp modification - auto [suffixCtrls, suffixCtrlStates] = getSuffixQubitsAndStates(qureg, ctrls, ctrlStates); - accel_statevec_anyCtrlOneTargDiagMatr_sub(qureg, suffixCtrls, suffixCtrlStates, targ, matr); + // only suffix control qubits are relevant to local amp modification + auto [suffixCtrls, suffixCtrlStates] = + getSuffixQubitsAndStates(qureg, ctrls, ctrlStates); + accel_statevec_anyCtrlOneTargDiagMatr_sub(qureg, suffixCtrls, + suffixCtrlStates, targ, matr); } +void localiser_statevec_anyCtrlTwoTargDiagMatr(Qureg qureg, ConstList64 ctrls, + ConstList64 ctrlStates, + int targ1, int targ2, + DiagMatr2 matr, bool conj) { + assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size()); -void localiser_statevec_anyCtrlTwoTargDiagMatr(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2, DiagMatr2 matr, bool conj) { - assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size()); + // node has nothing to do if all local amps violate control condition + if (!doAnyLocalStatesHaveQubitValues(qureg, ctrls, ctrlStates)) + return; - // node has nothing to do if all local amps violate control condition - if (!doAnyLocalStatesHaveQubitValues(qureg, ctrls, ctrlStates)) - return; + if (conj) + matr = util_getConj(matr); - if (conj) - matr = util_getConj(matr); - - // only suffix control qubits are relevant to local amp modification - auto [suffixCtrls, suffixCtrlStates] = getSuffixQubitsAndStates(qureg, ctrls, ctrlStates); - accel_statevec_anyCtrlTwoTargDiagMatr_sub(qureg, suffixCtrls, suffixCtrlStates, targ1, targ2, matr); + // only suffix control qubits are relevant to local amp modification + auto [suffixCtrls, suffixCtrlStates] = + getSuffixQubitsAndStates(qureg, ctrls, ctrlStates); + accel_statevec_anyCtrlTwoTargDiagMatr_sub( + qureg, suffixCtrls, suffixCtrlStates, targ1, targ2, matr); } +void localiser_statevec_anyCtrlAnyTargDiagMatr(Qureg qureg, ConstList64 ctrls, + ConstList64 ctrlStates, + ConstList64 targs, DiagMatr matr, + qcomp exponent, bool conj) { + assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size()); -void localiser_statevec_anyCtrlAnyTargDiagMatr(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, DiagMatr matr, qcomp exponent, bool conj) { - assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size()); - - // node has nothing to do if all local amps violate control condition - if (!doAnyLocalStatesHaveQubitValues(qureg, ctrls, ctrlStates)) - return; + // node has nothing to do if all local amps violate control condition + if (!doAnyLocalStatesHaveQubitValues(qureg, ctrls, ctrlStates)) + return; - // only suffix control qubits are relevant to local amp modification - auto [suffixCtrls, suffixCtrlStates] = getSuffixQubitsAndStates(qureg, ctrls, ctrlStates); - accel_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, suffixCtrls, suffixCtrlStates, targs, matr, exponent, conj); + // only suffix control qubits are relevant to local amp modification + auto [suffixCtrls, suffixCtrlStates] = + getSuffixQubitsAndStates(qureg, ctrls, ctrlStates); + accel_statevec_anyCtrlAnyTargDiagMatr_sub( + qureg, suffixCtrls, suffixCtrlStates, targs, matr, exponent, conj); } - - /* * ALL-TARGET DIAGONAL MATRIX */ - -void localiser_statevec_allTargDiagMatr(Qureg qureg, FullStateDiagMatr matr, qcomp exponent) { - assert_localiserGivenStateVec(qureg); - - // matr and qureg are equal dimension, but may be distributed differently - bool quregDist = qureg.isDistributed; - bool matrDist = matr.isDistributed; - - // cannot distribute only matr; qureg has no buffer space to receive a broadcast. - // allocating temporary buffer space is too dangerous, since it would be the same - // size as matr and qureg, potentially increasing user memory by a factor x1.5. - if (!quregDist && matrDist) - error_localiserGivenDistribMatrixAndLocalQureg(); - - // embarrassingly parallel when both distributed or both local - if (quregDist == matrDist) - accel_statevec_allTargDiagMatr_sub(qureg, matr, exponent); - - // embarrasingly parallel when only qureg is distributed (all nodes have all needed matr elems) - if (quregDist && !matrDist) { - auto copy = getSpoofedDistributedMatrFromDistributedQureg(matr, qureg); - accel_statevec_allTargDiagMatr_sub(qureg, copy, exponent); - } -} - - -void localiser_densmatr_allTargDiagMatr(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool applyLeft, bool applyRight, bool conjRight) { - assert_localiserGivenDensMatr(qureg); - - // the diagonal matr has quadratically fewer elements than the density-matrix - // qureg, so could in-theory be effected (without catastrophic performance) as - // an N/2-qubit DiagMatr upon an N-qubit statevector. This requires O(N) bitwise - // operations per-iteration which might cause a slowdown in non-memory-bandwidth - // bound settings (e.g. 8 qubit Quregs). So we here use an O(1) bespoke method. - - // since Qureg is quadratically bigger than matr, it is likely they have different - // distributions (matr is probably local). Because every column of qureg is - // dot-multiplied with the full matr, every node requires all matr elements - bool quregDist = qureg.isDistributed; - bool matrDist = matr.isDistributed; - - // cannot distribute only matr; qureg has no buffer space to receive a broadcast. - // in theory, we could allocate temporary buffer space which would only be - // quadratically smaller than qureg; but this is a ludicrous scenario to support. - if (!quregDist && matrDist) { - error_localiserGivenDistribMatrixAndLocalQureg(); - return; - } - - // when the matrix is not distributed, we call the same routine despite whether qureg - // is distributed or not; that merely changes how many qureg columns get updated - if (!matrDist) { - accel_densmatr_allTargDiagMatr_subA(qureg, matr, exponent, applyLeft, applyRight, conjRight); - return; - } - - // finally, when both are distributed, qureg has buffer space to receive all matr - comm_combineElemsIntoBuffer(qureg, matr); - - // matr elems are inside qureg buffer, but we still pass matr struct along to - // accelerator, because it is going to perform mischief to re-use subA(). - accel_densmatr_allTargDiagMatr_subB(qureg, matr, exponent, applyLeft, applyRight, conjRight); +void localiser_statevec_allTargDiagMatr(Qureg qureg, FullStateDiagMatr matr, + qcomp exponent) { + assert_localiserGivenStateVec(qureg); + + // matr and qureg are equal dimension, but may be distributed differently + bool quregDist = qureg.isDistributed; + bool matrDist = matr.isDistributed; + + // cannot distribute only matr; qureg has no buffer space to receive a + // broadcast. allocating temporary buffer space is too dangerous, since it + // would be the same size as matr and qureg, potentially increasing user + // memory by a factor x1.5. + if (!quregDist && matrDist) + error_localiserGivenDistribMatrixAndLocalQureg(); + + // embarrassingly parallel when both distributed or both local + if (quregDist == matrDist) + accel_statevec_allTargDiagMatr_sub(qureg, matr, exponent); + + // embarrasingly parallel when only qureg is distributed (all nodes have all + // needed matr elems) + if (quregDist && !matrDist) { + auto copy = getSpoofedDistributedMatrFromDistributedQureg(matr, qureg); + accel_statevec_allTargDiagMatr_sub(qureg, copy, exponent); + } +} + +void localiser_densmatr_allTargDiagMatr(Qureg qureg, FullStateDiagMatr matr, + qcomp exponent, bool applyLeft, + bool applyRight, bool conjRight) { + assert_localiserGivenDensMatr(qureg); + + // the diagonal matr has quadratically fewer elements than the density-matrix + // qureg, so could in-theory be effected (without catastrophic performance) as + // an N/2-qubit DiagMatr upon an N-qubit statevector. This requires O(N) + // bitwise operations per-iteration which might cause a slowdown in + // non-memory-bandwidth bound settings (e.g. 8 qubit Quregs). So we here use + // an O(1) bespoke method. + + // since Qureg is quadratically bigger than matr, it is likely they have + // different distributions (matr is probably local). Because every column of + // qureg is dot-multiplied with the full matr, every node requires all matr + // elements + bool quregDist = qureg.isDistributed; + bool matrDist = matr.isDistributed; + + // cannot distribute only matr; qureg has no buffer space to receive a + // broadcast. in theory, we could allocate temporary buffer space which would + // only be quadratically smaller than qureg; but this is a ludicrous scenario + // to support. + if (!quregDist && matrDist) { + error_localiserGivenDistribMatrixAndLocalQureg(); + return; + } + + // when the matrix is not distributed, we call the same routine despite + // whether qureg is distributed or not; that merely changes how many qureg + // columns get updated + if (!matrDist) { + accel_densmatr_allTargDiagMatr_subA(qureg, matr, exponent, applyLeft, + applyRight, conjRight); + return; + } + + // finally, when both are distributed, qureg has buffer space to receive all + // matr + comm_combineElemsIntoBuffer(qureg, matr); + + // matr elems are inside qureg buffer, but we still pass matr struct along to + // accelerator, because it is going to perform mischief to re-use subA(). + accel_densmatr_allTargDiagMatr_subB(qureg, matr, exponent, applyLeft, + applyRight, conjRight); } - - /* - * ANY-TARGET ANY-TYPE MATRIX - * + * ANY-TARGET ANY-TYPE MATRIX + * * This is merely a convenient gateway for callers to automatically * dispatch to the above specific functions, based on matrix type */ - template -void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, T matr, bool conj) { - - // this function is never invoked by operations whch require transposing matr - bool transp = false; - qcomp expo = 1; - - // suppress warnings since not used by all compile-time expansions below - (void) transp; - (void) expo; - - if constexpr (util_isDiagMatr ()) localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, ctrls, ctrlStates, targs, matr, expo, conj); - if constexpr (util_isDiagMatr1()) localiser_statevec_anyCtrlOneTargDiagMatr(qureg, ctrls, ctrlStates, targs[0], matr, conj); - if constexpr (util_isDiagMatr2()) localiser_statevec_anyCtrlTwoTargDiagMatr(qureg, ctrls, ctrlStates, targs[0], targs[1], matr, conj); - if constexpr (util_isCompMatr ()) localiser_statevec_anyCtrlAnyTargDenseMatr(qureg, ctrls, ctrlStates, targs, matr, conj, transp); - if constexpr (util_isCompMatr1()) localiser_statevec_anyCtrlOneTargDenseMatr(qureg, ctrls, ctrlStates, targs[0], matr, conj, transp); - if constexpr (util_isCompMatr2()) localiser_statevec_anyCtrlTwoTargDenseMatr(qureg, ctrls, ctrlStates, targs[0], targs[1], matr, conj, transp); -} - -template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, ConstList64, ConstList64, ConstList64, DiagMatr, bool); -template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, ConstList64, ConstList64, ConstList64, DiagMatr1, bool); -template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, ConstList64, ConstList64, ConstList64, DiagMatr2, bool); -template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, ConstList64, ConstList64, ConstList64, CompMatr, bool); -template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, ConstList64, ConstList64, ConstList64, CompMatr1, bool); -template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, ConstList64, ConstList64, ConstList64, CompMatr2, bool); - - +void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg qureg, ConstList64 ctrls, + ConstList64 ctrlStates, + ConstList64 targs, T matr, + bool conj) { + + // this function is never invoked by operations whch require transposing matr + bool transp = false; + qcomp expo = 1; + + // suppress warnings since not used by all compile-time expansions below + (void)transp; + (void)expo; + + if constexpr (util_isDiagMatr()) + localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, ctrls, ctrlStates, targs, + matr, expo, conj); + if constexpr (util_isDiagMatr1()) + localiser_statevec_anyCtrlOneTargDiagMatr(qureg, ctrls, ctrlStates, + targs[0], matr, conj); + if constexpr (util_isDiagMatr2()) + localiser_statevec_anyCtrlTwoTargDiagMatr(qureg, ctrls, ctrlStates, + targs[0], targs[1], matr, conj); + if constexpr (util_isCompMatr()) + localiser_statevec_anyCtrlAnyTargDenseMatr(qureg, ctrls, ctrlStates, targs, + matr, conj, transp); + if constexpr (util_isCompMatr1()) + localiser_statevec_anyCtrlOneTargDenseMatr(qureg, ctrls, ctrlStates, + targs[0], matr, conj, transp); + if constexpr (util_isCompMatr2()) + localiser_statevec_anyCtrlTwoTargDenseMatr( + qureg, ctrls, ctrlStates, targs[0], targs[1], matr, conj, transp); +} + +template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, ConstList64, + ConstList64, ConstList64, + DiagMatr, bool); +template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, ConstList64, + ConstList64, ConstList64, + DiagMatr1, bool); +template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, ConstList64, + ConstList64, ConstList64, + DiagMatr2, bool); +template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, ConstList64, + ConstList64, ConstList64, + CompMatr, bool); +template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, ConstList64, + ConstList64, ConstList64, + CompMatr1, bool); +template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, ConstList64, + ConstList64, ConstList64, + CompMatr2, bool); /* * PAULI TENSORS AND GADGETS */ - -void anyCtrlZTensorOrGadget(Qureg qureg, ConstList64 allCtrls, ConstList64 allCtrlStates, ConstList64 targs, bool isGadget, qcomp phase) { - - // node has nothing to do if all local amps violate control condition - if (!doAnyLocalStatesHaveQubitValues(qureg, allCtrls, allCtrlStates)) - return; - - // retain only suffix control qubits, as relevant to local amp modification - auto [suffixCtrls, suffixCtrlStates] = getSuffixQubitsAndStates(qureg, allCtrls, allCtrlStates); - - // prefixZ merely applies a node-wide factor to fac0 and fac1 - auto [prefixZ, suffixZ] = util_getPrefixAndSuffixQubits(targs, qureg); - int sign = paulis_getPrefixZSign(qureg, prefixZ); - - // tensor multiplies +-1, gadget multiplies exp(+- i phase) - qcomp fac0 = (isGadget)? std::exp(+ phase * sign * 1_i) : +1 * sign; - qcomp fac1 = (isGadget)? std::exp(- phase * sign * 1_i) : -1 * sign; - - // simulation is always embarrassingly parallel - accel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(qureg, suffixCtrls, suffixCtrlStates, suffixZ, fac0, fac1); -} - - -void anyCtrlPauliTensorOrGadget(Qureg qureg, ConstList64 allCtrls, ConstList64 allCtrlStates, PauliStr str, qcomp ampFac, qcomp pairAmpFac) { - - // this routine is invalid for str=ZI - if (!paulis_containsXOrY(str)) - error_localiserGivenPauliStrWithoutXorY(); - - // node has nothing to do if all local amps violate control condition - if (!doAnyLocalStatesHaveQubitValues(qureg, allCtrls, allCtrlStates)) - return; - - // retain only suffix control qubits, as relevant to local amp modification - auto [suffixCtrls, suffixCtrlStates] = getSuffixQubitsAndStates(qureg, allCtrls, allCtrlStates); - - // partition non-Id Paulis into prefix and suffix, since... - // - prefix X,Y determine communication, because they apply bit-not to rank - // - prefix Y,Z determine node-wide coefficient, because they contain rank-determined !=1 elements - // - suffix X,Y,Z determine local amp coefficients - auto [targsX, targsY, targsZ] = paulis_getSeparateInds(str); - auto [prefixX, suffixX] = util_getPrefixAndSuffixQubits(targsX, qureg); - auto [prefixY, suffixY] = util_getPrefixAndSuffixQubits(targsY, qureg); - auto [prefixZ, suffixZ] = util_getPrefixAndSuffixQubits(targsZ, qureg); - - // scale pair amp's coefficient by node-wide coeff - pairAmpFac *= paulis_getPrefixPaulisElem(qureg, prefixY, prefixZ); // 1 when embarrassingly parallel - - // embarrassingly parallel when there is only Z's in prefix - if (prefixX.empty() && prefixY.empty()) { - accel_statevector_anyCtrlPauliTensorOrGadget_subA( - qureg, suffixCtrls, suffixCtrlStates, suffixX, suffixY, suffixZ, ampFac, pairAmpFac); - return; - } - - // otherwise, we pair-wise communicate amps satisfying ctrls - auto prefixXY = util_getConcatenated(prefixX, prefixY); - int pairRank = util_getRankWithQubitsFlipped(prefixXY, qureg); - exchangeAmpsToBuffersWhereQubitsAreInStates(qureg, pairRank, suffixCtrls, suffixCtrlStates); - - // ctrls reduce communicated amps, so received buffer is compacted; - // we must ergo prepare a no-ctrl XY mask for accessing buffer elems - auto sortedCtrls = util_getSorted(suffixCtrls); - auto suffixMaskXY = util_getBitMask(util_getConcatenated(suffixX, suffixY)); - auto bufferMaskXY = removeBits(suffixMaskXY, sortedCtrls.data(), sortedCtrls.size()); - - accel_statevector_anyCtrlPauliTensorOrGadget_subB( - qureg, suffixCtrls, suffixCtrlStates, suffixX, suffixY, suffixZ, ampFac, pairAmpFac, bufferMaskXY); -} - - -void localiser_statevec_anyCtrlPauliTensor(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, PauliStr str, qcomp factor) { - assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size()); - - // this function accepts a global factor, so that density matrices can effect conj(pauli) - - if (paulis_containsXOrY(str)) { - - // (X|Y)|0/1> ~ |1/0> - qcomp ampFac = 0 * factor; - qcomp pairAmpFac = 1 * factor; - anyCtrlPauliTensorOrGadget(qureg, ctrls, ctrlStates, str, ampFac, pairAmpFac); - - } else { - // global factor is inapplicable to all-Z and is ignored - if (factor != qcomp(1,0)) - error_localiserGivenNonUnityGlobalFactorToZTensor(); - - bool isGadget = false; - qreal phase = 0; // ignored - anyCtrlZTensorOrGadget(qureg, ctrls, ctrlStates, paulis_getTargetInds(str), isGadget, phase); - } -} - - -void localiser_statevec_anyCtrlPhaseGadget(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, qcomp phase) { - assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size()); - - bool isGadget = true; - anyCtrlZTensorOrGadget(qureg, ctrls, ctrlStates, targs, isGadget, phase); -} - - -void localiser_statevec_anyCtrlPauliGadget(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, PauliStr str, qcomp phase) { - assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size()); - - // when str=IZ, we must use the above bespoke algorithm - if (!paulis_containsXOrY(str)) { - localiser_statevec_anyCtrlPhaseGadget(qureg, ctrls, ctrlStates, paulis_getTargetInds(str), phase); - return; - } - - qcomp ampFac = std::cos(phase); - qcomp pairAmpFac = std::sin(phase) * 1_i; - anyCtrlPauliTensorOrGadget(qureg, ctrls, ctrlStates, str, ampFac, pairAmpFac); +void anyCtrlZTensorOrGadget(Qureg qureg, ConstList64 allCtrls, + ConstList64 allCtrlStates, ConstList64 targs, + bool isGadget, qcomp phase) { + + // node has nothing to do if all local amps violate control condition + if (!doAnyLocalStatesHaveQubitValues(qureg, allCtrls, allCtrlStates)) + return; + + // retain only suffix control qubits, as relevant to local amp modification + auto [suffixCtrls, suffixCtrlStates] = + getSuffixQubitsAndStates(qureg, allCtrls, allCtrlStates); + + // prefixZ merely applies a node-wide factor to fac0 and fac1 + auto [prefixZ, suffixZ] = util_getPrefixAndSuffixQubits(targs, qureg); + int sign = paulis_getPrefixZSign(qureg, prefixZ); + + // tensor multiplies +-1, gadget multiplies exp(+- i phase) + qcomp fac0 = (isGadget) ? std::exp(+phase * sign * 1_i) : +1 * sign; + qcomp fac1 = (isGadget) ? std::exp(-phase * sign * 1_i) : -1 * sign; + + // simulation is always embarrassingly parallel + accel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub( + qureg, suffixCtrls, suffixCtrlStates, suffixZ, fac0, fac1); +} + +void anyCtrlPauliTensorOrGadget(Qureg qureg, ConstList64 allCtrls, + ConstList64 allCtrlStates, PauliStr str, + qcomp ampFac, qcomp pairAmpFac) { + + // this routine is invalid for str=ZI + if (!paulis_containsXOrY(str)) + error_localiserGivenPauliStrWithoutXorY(); + + // node has nothing to do if all local amps violate control condition + if (!doAnyLocalStatesHaveQubitValues(qureg, allCtrls, allCtrlStates)) + return; + + // retain only suffix control qubits, as relevant to local amp modification + auto [suffixCtrls, suffixCtrlStates] = + getSuffixQubitsAndStates(qureg, allCtrls, allCtrlStates); + + // partition non-Id Paulis into prefix and suffix, since... + // - prefix X,Y determine communication, because they apply bit-not to rank + // - prefix Y,Z determine node-wide coefficient, because they contain + // rank-determined !=1 elements + // - suffix X,Y,Z determine local amp coefficients + auto [targsX, targsY, targsZ] = paulis_getSeparateInds(str); + auto [prefixX, suffixX] = util_getPrefixAndSuffixQubits(targsX, qureg); + auto [prefixY, suffixY] = util_getPrefixAndSuffixQubits(targsY, qureg); + auto [prefixZ, suffixZ] = util_getPrefixAndSuffixQubits(targsZ, qureg); + + // scale pair amp's coefficient by node-wide coeff + pairAmpFac *= paulis_getPrefixPaulisElem( + qureg, prefixY, prefixZ); // 1 when embarrassingly parallel + + // embarrassingly parallel when there is only Z's in prefix + if (prefixX.empty() && prefixY.empty()) { + accel_statevector_anyCtrlPauliTensorOrGadget_subA( + qureg, suffixCtrls, suffixCtrlStates, suffixX, suffixY, suffixZ, ampFac, + pairAmpFac); + return; + } + + // otherwise, we pair-wise communicate amps satisfying ctrls + auto prefixXY = util_getConcatenated(prefixX, prefixY); + int pairRank = util_getRankWithQubitsFlipped(prefixXY, qureg); + exchangeAmpsToBuffersWhereQubitsAreInStates(qureg, pairRank, suffixCtrls, + suffixCtrlStates); + + // ctrls reduce communicated amps, so received buffer is compacted; + // we must ergo prepare a no-ctrl XY mask for accessing buffer elems + auto sortedCtrls = util_getSorted(suffixCtrls); + auto suffixMaskXY = util_getBitMask(util_getConcatenated(suffixX, suffixY)); + auto bufferMaskXY = + removeBits(suffixMaskXY, sortedCtrls.data(), sortedCtrls.size()); + + accel_statevector_anyCtrlPauliTensorOrGadget_subB( + qureg, suffixCtrls, suffixCtrlStates, suffixX, suffixY, suffixZ, ampFac, + pairAmpFac, bufferMaskXY); +} + +void localiser_statevec_anyCtrlPauliTensor(Qureg qureg, ConstList64 ctrls, + ConstList64 ctrlStates, PauliStr str, + qcomp factor) { + assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size()); + + // this function accepts a global factor, so that density matrices can effect + // conj(pauli) + + if (paulis_containsXOrY(str)) { + + // (X|Y)|0/1> ~ |1/0> + qcomp ampFac = 0 * factor; + qcomp pairAmpFac = 1 * factor; + anyCtrlPauliTensorOrGadget(qureg, ctrls, ctrlStates, str, ampFac, + pairAmpFac); + + } else { + // global factor is inapplicable to all-Z and is ignored + if (factor != qcomp(1, 0)) + error_localiserGivenNonUnityGlobalFactorToZTensor(); + + bool isGadget = false; + qreal phase = 0; // ignored + anyCtrlZTensorOrGadget(qureg, ctrls, ctrlStates, paulis_getTargetInds(str), + isGadget, phase); + } +} + +void localiser_statevec_anyCtrlPhaseGadget(Qureg qureg, ConstList64 ctrls, + ConstList64 ctrlStates, + ConstList64 targs, qcomp phase) { + assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size()); + + bool isGadget = true; + anyCtrlZTensorOrGadget(qureg, ctrls, ctrlStates, targs, isGadget, phase); +} + +void localiser_statevec_anyCtrlPauliGadget(Qureg qureg, ConstList64 ctrls, + ConstList64 ctrlStates, PauliStr str, + qcomp phase) { + assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size()); + + // when str=IZ, we must use the above bespoke algorithm + if (!paulis_containsXOrY(str)) { + localiser_statevec_anyCtrlPhaseGadget(qureg, ctrls, ctrlStates, + paulis_getTargetInds(str), phase); + return; + } + + qcomp ampFac = std::cos(phase); + qcomp pairAmpFac = std::sin(phase) * 1_i; + anyCtrlPauliTensorOrGadget(qureg, ctrls, ctrlStates, str, ampFac, pairAmpFac); } - - /* * QUREG COMBINATION */ +void localiser_statevec_setQuregToWeightedSum(Qureg outQureg, + vector coeffs, + vector inQuregs) { -void localiser_statevec_setQuregToWeightedSum(Qureg outQureg, vector coeffs, vector inQuregs) { - - /// @todo - /// this function requires (as validated) distributions are identical. - /// It would be trivial to generalise this so that Qureg distributions - /// can differ (we merely spoof local Quregs, offsetting their memory). - /// They must still however be identically GPU-accelerated; this is a - /// low priority because this situation is non-sensical + /// @todo + /// this function requires (as validated) distributions are identical. + /// It would be trivial to generalise this so that Qureg distributions + /// can differ (we merely spoof local Quregs, offsetting their memory). + /// They must still however be identically GPU-accelerated; this is a + /// low priority because this situation is non-sensical - accel_statevec_setQuregToWeightedSum_sub(outQureg, coeffs, inQuregs); + accel_statevec_setQuregToWeightedSum_sub(outQureg, coeffs, inQuregs); } - void localiser_statevec_setQuregToClone(Qureg out, Qureg in) { - /// @todo - /// we lazily re-use setQuregToWeightedSum(), inducing a gratuitous - /// x1 multiplication per element which we expected is completely - /// occluded by memory movement costs. We should check this and - /// potentially replace this function with (NUMA-aware?) memory copying! + /// @todo + /// we lazily re-use setQuregToWeightedSum(), inducing a gratuitous + /// x1 multiplication per element which we expected is completely + /// occluded by memory movement costs. We should check this and + /// potentially replace this function with (NUMA-aware?) memory copying! - localiser_statevec_setQuregToWeightedSum(out, {1}, {in}); + localiser_statevec_setQuregToWeightedSum(out, {1}, {in}); } +void mixDensityMatrixWithStatevector(qreal outProb, Qureg out, qreal inProb, + Qureg in) { -void mixDensityMatrixWithStatevector(qreal outProb, Qureg out, qreal inProb, Qureg in) { - - // we can handle 3 out of 4 possible combinations of distribution, - // and accelerator.hpp will handle every combination of GPU-accel - bool outDist = out.isDistributed; - bool inDist = in.isDistributed; + // we can handle 3 out of 4 possible combinations of distribution, + // and accelerator.hpp will handle every combination of GPU-accel + bool outDist = out.isDistributed; + bool inDist = in.isDistributed; - // illegal to distribute only the smaller Qureg; 'out' has no buffer space to receive it. - // in theory, we could allocate a temporary receive buffer since it is merely quadratically - // smaller than 'out' so will be a negligible memory overhead; but this scenario (having a - // smaller distributed Qureg) is completely ludicrous and unworth supporting - if (!outDist && inDist) - error_mixQuregsAreLocalDensMatrAndDistribStatevec(); + // illegal to distribute only the smaller Qureg; 'out' has no buffer space to + // receive it. in theory, we could allocate a temporary receive buffer since + // it is merely quadratically smaller than 'out' so will be a negligible + // memory overhead; but this scenario (having a smaller distributed Qureg) is + // completely ludicrous and unworth supporting + if (!outDist && inDist) + error_mixQuregsAreLocalDensMatrAndDistribStatevec(); - // both non-distributed is trivial - if (!outDist && !inDist) - accel_densmatr_mixQureg_subB(outProb, out, inProb, in); + // both non-distributed is trivial + if (!outDist && !inDist) + accel_densmatr_mixQureg_subB(outProb, out, inProb, in); - // both distributed requires broadcasting 'in' into every node's 'out' buffer - if (outDist && inDist) { - comm_combineAmpsIntoBuffer(out, in); // uses same buffer that subC() consults - accel_densmatr_mixQureg_subC(outProb, out, inProb); - } + // both distributed requires broadcasting 'in' into every node's 'out' buffer + if (outDist && inDist) { + comm_combineAmpsIntoBuffer(out, + in); // uses same buffer that subC() consults + accel_densmatr_mixQureg_subC(outProb, out, inProb); + } - // only 'out' being distributed means simulation is embarrasingly parallel, - // because the full 'in' is already known on every node - if (outDist && !inDist) - accel_densmatr_mixQureg_subD(outProb, out, inProb, in); + // only 'out' being distributed means simulation is embarrasingly parallel, + // because the full 'in' is already known on every node + if (outDist && !inDist) + accel_densmatr_mixQureg_subD(outProb, out, inProb, in); } +void localiser_densmatr_mixQureg(qreal outProb, Qureg out, qreal inProb, + Qureg in) { + assert_localiserGivenDensMatr(out); -void localiser_densmatr_mixQureg(qreal outProb, Qureg out, qreal inProb, Qureg in) { - assert_localiserGivenDensMatr(out); - - (in.isDensityMatrix)? - accel_densmatr_mixQureg_subA(outProb, out, inProb, in): - mixDensityMatrixWithStatevector(outProb, out, inProb, in); + (in.isDensityMatrix) + ? accel_densmatr_mixQureg_subA(outProb, out, inProb, in) + : mixDensityMatrixWithStatevector(outProb, out, inProb, in); } - - /* * DEPHASING */ - void localiser_densmatr_oneQubitDephasing(Qureg qureg, int qubit, qreal prob) { - assert_localiserGivenDensMatr(qureg); + assert_localiserGivenDensMatr(qureg); - // both methods are embarrassingly parallel - (util_isBraQubitInSuffix(qubit, qureg))? - accel_densmatr_oneQubitDephasing_subA(qureg, qubit, prob): - accel_densmatr_oneQubitDephasing_subB(qureg, qubit, prob); + // both methods are embarrassingly parallel + (util_isBraQubitInSuffix(qubit, qureg)) + ? accel_densmatr_oneQubitDephasing_subA(qureg, qubit, prob) + : accel_densmatr_oneQubitDephasing_subB(qureg, qubit, prob); } +void localiser_densmatr_twoQubitDephasing(Qureg qureg, int qubit1, int qubit2, + qreal prob) { + assert_localiserGivenDensMatr(qureg); -void localiser_densmatr_twoQubitDephasing(Qureg qureg, int qubit1, int qubit2, qreal prob) { - assert_localiserGivenDensMatr(qureg); - - // relative size of qubit1 and qubit2 does not matter - int leftQubit = std::max(qubit1, qubit2); + // relative size of qubit1 and qubit2 does not matter + int leftQubit = std::max(qubit1, qubit2); - // both methods are embarrassingly parallel - (util_isBraQubitInSuffix(leftQubit, qureg))? - accel_densmatr_twoQubitDephasing_subA(qureg, qubit1, qubit2, prob): - accel_densmatr_twoQubitDephasing_subB(qureg, qubit1, qubit2, prob); + // both methods are embarrassingly parallel + (util_isBraQubitInSuffix(leftQubit, qureg)) + ? accel_densmatr_twoQubitDephasing_subA(qureg, qubit1, qubit2, prob) + : accel_densmatr_twoQubitDephasing_subB(qureg, qubit1, qubit2, prob); } - - /* * ONE-QUBIT DEPOLARISING */ - void oneQubitDepolarisingOnPrefix(Qureg qureg, int ketQubit, qreal prob) { - assert_localiserGivenDensMatr(qureg); + assert_localiserGivenDensMatr(qureg); - // pack and exchange amps to buffers where local ket qubit and fixed-prefix-bra qubit agree - int braBit = util_getRankBitOfBraQubit(ketQubit, qureg); - int pairRank = util_getRankWithBraQubitFlipped(ketQubit, qureg); - exchangeAmpsToBuffersWhereQubitsAreInStates(qureg, pairRank, lists_getList64({ketQubit}), lists_getList64({braBit})); + // pack and exchange amps to buffers where local ket qubit and + // fixed-prefix-bra qubit agree + int braBit = util_getRankBitOfBraQubit(ketQubit, qureg); + int pairRank = util_getRankWithBraQubitFlipped(ketQubit, qureg); + exchangeAmpsToBuffersWhereQubitsAreInStates( + qureg, pairRank, lists_getList64({ketQubit}), lists_getList64({braBit})); - // use received sub-buffer to update local amps - accel_densmatr_oneQubitDepolarising_subB(qureg, ketQubit, prob); + // use received sub-buffer to update local amps + accel_densmatr_oneQubitDepolarising_subB(qureg, ketQubit, prob); } +void localiser_densmatr_oneQubitDepolarising(Qureg qureg, int qubit, + qreal prob) { + assert_localiserGivenDensMatr(qureg); -void localiser_densmatr_oneQubitDepolarising(Qureg qureg, int qubit, qreal prob) { - assert_localiserGivenDensMatr(qureg); - - // perform embarrassingly parallel routine or pairwise communication - (doesChannelRequireComm(qureg, qubit))? - oneQubitDepolarisingOnPrefix(qureg, qubit, prob): - accel_densmatr_oneQubitDepolarising_subA(qureg, qubit, prob); + // perform embarrassingly parallel routine or pairwise communication + (doesChannelRequireComm(qureg, qubit)) + ? oneQubitDepolarisingOnPrefix(qureg, qubit, prob) + : accel_densmatr_oneQubitDepolarising_subA(qureg, qubit, prob); } - - /* * TWO-QUBIT DEPOLARISING */ +void twoQubitDepolarisingOnPrefixAndSuffix(Qureg qureg, int ketQb1, int ketQb2, + qreal prob) { + assert_localiserGivenDensMatr(qureg); -void twoQubitDepolarisingOnPrefixAndSuffix(Qureg qureg, int ketQb1, int ketQb2, qreal prob) { - assert_localiserGivenDensMatr(qureg); + // scale 25% of amps; precisely those which are not communicated + accel_densmatr_twoQubitDepolarising_subC(qureg, ketQb1, ketQb2, prob); - // scale 25% of amps; precisely those which are not communicated - accel_densmatr_twoQubitDepolarising_subC(qureg, ketQb1, ketQb2, prob); + // pack an eighth of the buffer with pair-summed amps + int braQb1 = util_getBraQubit(ketQb1, qureg); + int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg); + qindex numPacked = accel_statevec_packPairSummedAmpsIntoBuffer( + qureg, ketQb1, ketQb2, braQb1, braBit2); - // pack an eighth of the buffer with pair-summed amps - int braQb1 = util_getBraQubit(ketQb1, qureg); - int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg); - qindex numPacked = accel_statevec_packPairSummedAmpsIntoBuffer(qureg, ketQb1, ketQb2, braQb1, braBit2); + // exchange sub-buffers + int pairRank = util_getRankWithBraQubitFlipped(ketQb2, qureg); + comm_exchangeSubBuffers(qureg, numPacked, pairRank); - // exchange sub-buffers - int pairRank = util_getRankWithBraQubitFlipped(ketQb2, qureg); - comm_exchangeSubBuffers(qureg, numPacked, pairRank); - - // update 25% of local amps using received buffer amps - accel_densmatr_twoQubitDepolarising_subD(qureg, ketQb1, ketQb2, prob); + // update 25% of local amps using received buffer amps + accel_densmatr_twoQubitDepolarising_subD(qureg, ketQb1, ketQb2, prob); } +void twoQubitDepolarisingOnPrefixAndPrefix(Qureg qureg, int ketQb1, int ketQb2, + qreal prob) { + assert_localiserGivenDensMatr(qureg); -void twoQubitDepolarisingOnPrefixAndPrefix(Qureg qureg, int ketQb1, int ketQb2, qreal prob) { - assert_localiserGivenDensMatr(qureg); - - int braBit1 = util_getRankBitOfBraQubit(ketQb1, qureg); - int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg); + int braBit1 = util_getRankBitOfBraQubit(ketQb1, qureg); + int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg); - // pack unscaled amps before subsequent scaling - auto ketList = lists_getList64({ketQb1,ketQb2}); - qindex numPacked = accel_statevec_packAmpsIntoBuffer(qureg, ketList, lists_getList64({braBit1,braBit2})); + // pack unscaled amps before subsequent scaling + auto ketList = lists_getList64({ketQb1, ketQb2}); + qindex numPacked = accel_statevec_packAmpsIntoBuffer( + qureg, ketList, lists_getList64({braBit1, braBit2})); - // scale all amps - accel_densmatr_twoQubitDepolarising_subE(qureg, ketQb1, ketQb2, prob); + // scale all amps + accel_densmatr_twoQubitDepolarising_subE(qureg, ketQb1, ketQb2, prob); - // swap the buffer with 3 other nodes to update local amps - int pairRank1 = util_getRankWithBraQubitFlipped(ketQb1, qureg); - int pairRank2 = util_getRankWithBraQubitFlipped(ketQb2, qureg); - int pairRank3 = util_getRankWithBraQubitsFlipped(ketList, qureg); + // swap the buffer with 3 other nodes to update local amps + int pairRank1 = util_getRankWithBraQubitFlipped(ketQb1, qureg); + int pairRank2 = util_getRankWithBraQubitFlipped(ketQb2, qureg); + int pairRank3 = util_getRankWithBraQubitsFlipped(ketList, qureg); - comm_exchangeSubBuffers(qureg, numPacked, pairRank1); - accel_densmatr_twoQubitDepolarising_subF(qureg, ketQb1, ketQb2, prob); + comm_exchangeSubBuffers(qureg, numPacked, pairRank1); + accel_densmatr_twoQubitDepolarising_subF(qureg, ketQb1, ketQb2, prob); - comm_exchangeSubBuffers(qureg, numPacked, pairRank2); - accel_densmatr_twoQubitDepolarising_subF(qureg, ketQb1, ketQb2, prob); + comm_exchangeSubBuffers(qureg, numPacked, pairRank2); + accel_densmatr_twoQubitDepolarising_subF(qureg, ketQb1, ketQb2, prob); - comm_exchangeSubBuffers(qureg, numPacked, pairRank3); - accel_densmatr_twoQubitDepolarising_subF(qureg, ketQb1, ketQb2, prob); + comm_exchangeSubBuffers(qureg, numPacked, pairRank3); + accel_densmatr_twoQubitDepolarising_subF(qureg, ketQb1, ketQb2, prob); } +void localiser_densmatr_twoQubitDepolarising(Qureg qureg, int qubit1, + int qubit2, qreal prob) { + assert_localiserGivenDensMatr(qureg); -void localiser_densmatr_twoQubitDepolarising(Qureg qureg, int qubit1, int qubit2, qreal prob) { - assert_localiserGivenDensMatr(qureg); - - // ensure qubit2 > qubit1 - if (qubit1 > qubit2) - std::swap(qubit1, qubit2); + // ensure qubit2 > qubit1 + if (qubit1 > qubit2) + std::swap(qubit1, qubit2); - // determine necessary communication - bool comm1 = doesChannelRequireComm(qureg, qubit1); - bool comm2 = doesChannelRequireComm(qureg, qubit2); + // determine necessary communication + bool comm1 = doesChannelRequireComm(qureg, qubit1); + bool comm2 = doesChannelRequireComm(qureg, qubit2); - if (comm2 && comm1) - twoQubitDepolarisingOnPrefixAndPrefix(qureg, qubit1, qubit2, prob); - if (comm2 && !comm1) - twoQubitDepolarisingOnPrefixAndSuffix(qureg, qubit1, qubit2, prob); - if (!comm2 && !comm1) { - accel_densmatr_twoQubitDepolarising_subA(qureg, qubit1, qubit2, prob); - accel_densmatr_twoQubitDepolarising_subB(qureg, qubit1, qubit2, prob); - } + if (comm2 && comm1) + twoQubitDepolarisingOnPrefixAndPrefix(qureg, qubit1, qubit2, prob); + if (comm2 && !comm1) + twoQubitDepolarisingOnPrefixAndSuffix(qureg, qubit1, qubit2, prob); + if (!comm2 && !comm1) { + accel_densmatr_twoQubitDepolarising_subA(qureg, qubit1, qubit2, prob); + accel_densmatr_twoQubitDepolarising_subB(qureg, qubit1, qubit2, prob); + } } - - /* * PAULI CHANNEL */ +void oneQubitPauliChannelOnPrefix(Qureg qureg, int ketQubit, qreal probI, + qreal probX, qreal probY, qreal probZ) { + assert_localiserGivenDensMatr(qureg); -void oneQubitPauliChannelOnPrefix(Qureg qureg, int ketQubit, qreal probI, qreal probX, qreal probY, qreal probZ) { - assert_localiserGivenDensMatr(qureg); - - // exchange all amps with pair node - int pairRank = util_getRankWithBraQubitFlipped(ketQubit, qureg); - comm_exchangeAmpsToBuffers(qureg, pairRank); + // exchange all amps with pair node + int pairRank = util_getRankWithBraQubitFlipped(ketQubit, qureg); + comm_exchangeAmpsToBuffers(qureg, pairRank); - // use received buffer to update local amps - accel_densmatr_oneQubitPauliChannel_subB(qureg, ketQubit, probI, probX, probY, probZ); + // use received buffer to update local amps + accel_densmatr_oneQubitPauliChannel_subB(qureg, ketQubit, probI, probX, probY, + probZ); } +void localiser_densmatr_oneQubitPauliChannel(Qureg qureg, int qubit, + qreal probX, qreal probY, + qreal probZ) { + assert_localiserGivenDensMatr(qureg); -void localiser_densmatr_oneQubitPauliChannel(Qureg qureg, int qubit, qreal probX, qreal probY, qreal probZ) { - assert_localiserGivenDensMatr(qureg); + // infer the no-error probability + qreal probI = 1 - probX - probY - probZ; - // infer the no-error probability - qreal probI = 1 - probX - probY - probZ; - - (doesChannelRequireComm(qureg, qubit))? - oneQubitPauliChannelOnPrefix(qureg, qubit, probI, probX, probY, probZ): - accel_densmatr_oneQubitPauliChannel_subA(qureg, qubit, probI, probX, probY, probZ); + (doesChannelRequireComm(qureg, qubit)) + ? oneQubitPauliChannelOnPrefix(qureg, qubit, probI, probX, probY, probZ) + : accel_densmatr_oneQubitPauliChannel_subA(qureg, qubit, probI, probX, + probY, probZ); } - -// twoQubitPauliChannel() is regrettably too difficult; the communication model cannot be -// simplified the way it was in twoQubitDepolarising() which leveraged the uniform -// coefficients. It is not clear whether arbitrary coefficients, which cause many more -// amplitudes to mix, can ever be performed in a sequence of pairwise communication - - +// twoQubitPauliChannel() is regrettably too difficult; the communication model +// cannot be simplified the way it was in twoQubitDepolarising() which leveraged +// the uniform coefficients. It is not clear whether arbitrary coefficients, +// which cause many more amplitudes to mix, can ever be performed in a sequence +// of pairwise communication /* * AMPLITUDE DAMPING */ - void oneQubitDampingOnPrefix(Qureg qureg, int ketQubit, qreal prob) { - int braBit = util_getRankBitOfBraQubit(ketQubit, qureg); - int pairRank = util_getRankWithBraQubitFlipped(ketQubit, qureg); - qindex numAmps = qureg.numAmpsPerNode / 2; + int braBit = util_getRankBitOfBraQubit(ketQubit, qureg); + int pairRank = util_getRankWithBraQubitFlipped(ketQubit, qureg); + qindex numAmps = qureg.numAmpsPerNode / 2; - // half of all nodes... - if (braBit == 1) { + // half of all nodes... + if (braBit == 1) { - // pack and async send half the buffer - accel_statevec_packAmpsIntoBuffer(qureg, lists_getList64({ketQubit}), lists_getList64({1})); - comm_asynchSendSubBuffer(qureg, numAmps, pairRank); + // pack and async send half the buffer + accel_statevec_packAmpsIntoBuffer(qureg, lists_getList64({ketQubit}), + lists_getList64({1})); + comm_asynchSendSubBuffer(qureg, numAmps, pairRank); - // scale the local amps which were just sent - accel_densmatr_oneQubitDamping_subB(qureg, ketQubit, prob); - } + // scale the local amps which were just sent + accel_densmatr_oneQubitDamping_subB(qureg, ketQubit, prob); + } - // all nodes scale the other half of their local amps - accel_densmatr_oneQubitDamping_subC(qureg, ketQubit, prob); + // all nodes scale the other half of their local amps + accel_densmatr_oneQubitDamping_subC(qureg, ketQubit, prob); - // the other remaining half of all nodes... - if (braBit == 0) { + // the other remaining half of all nodes... + if (braBit == 0) { - // receive the async-sent buffer - comm_receiveArrayToBuffer(qureg, numAmps, pairRank); - accel_densmatr_oneQubitDamping_subD(qureg, ketQubit, prob); - } + // receive the async-sent buffer + comm_receiveArrayToBuffer(qureg, numAmps, pairRank); + accel_densmatr_oneQubitDamping_subD(qureg, ketQubit, prob); + } - // prevent asynch senders from proceeding so their buffer isn't prematurely modified - comm_sync(); + // prevent asynch senders from proceeding so their buffer isn't prematurely + // modified + comm_sync(); } - void localiser_densmatr_oneQubitDamping(Qureg qureg, int qubit, qreal prob) { - assert_localiserGivenDensMatr(qureg); + assert_localiserGivenDensMatr(qureg); - (doesChannelRequireComm(qureg, qubit))? - oneQubitDampingOnPrefix(qureg, qubit, prob): - accel_densmatr_oneQubitDamping_subA(qureg, qubit, prob); + (doesChannelRequireComm(qureg, qubit)) + ? oneQubitDampingOnPrefix(qureg, qubit, prob) + : accel_densmatr_oneQubitDamping_subA(qureg, qubit, prob); } - - /* * SUPEROPERATORS AND KRAUS */ - CompMatr getSpoofedCompMatrFromSuperOp(SuperOp op) { - // prepare output CompMatr (avoiding C++20 designated initialiser) - CompMatr out; - - // superoperator acts on twice as many qubits - out.numQubits = 2 * op.numQubits; - out.numRows = op.numRows; + // prepare output CompMatr (avoiding C++20 designated initialiser) + CompMatr out; - // heap fields are not consulted - out.isApproxUnitary = nullptr; - out.isApproxHermitian = nullptr; - out.wasGpuSynced = nullptr; - - // copy pointers (noting cpuElems is 2D/nested) - out.cpuElems = op.cpuElems; - out.cpuElemsFlat = op.cpuElemsFlat; - out.gpuElemsFlat = op.gpuElemsFlat; - - return out; -} + // superoperator acts on twice as many qubits + out.numQubits = 2 * op.numQubits; + out.numRows = op.numRows; + // heap fields are not consulted + out.isApproxUnitary = nullptr; + out.isApproxHermitian = nullptr; + out.wasGpuSynced = nullptr; -void localiser_densmatr_superoperator(Qureg qureg, SuperOp op, ConstList64 ketTargs) { - assert_localiserGivenDensMatr(qureg); + // copy pointers (noting cpuElems is 2D/nested) + out.cpuElems = op.cpuElems; + out.cpuElemsFlat = op.cpuElemsFlat; + out.gpuElemsFlat = op.gpuElemsFlat; - // effect the superoperator as a dense matrix on the ket + bra qubits - bool conj = false; - bool transp = false; - auto braTargs = util_getBraQubits(ketTargs, qureg); - auto allTargs = util_getConcatenated(ketTargs, braTargs); - CompMatr matr = getSpoofedCompMatrFromSuperOp(op); - List64 empty = lists_getEmptyList64(); - localiser_statevec_anyCtrlAnyTargDenseMatr(qureg, empty, empty, allTargs, matr, conj, transp); + return out; } +void localiser_densmatr_superoperator(Qureg qureg, SuperOp op, + ConstList64 ketTargs) { + assert_localiserGivenDensMatr(qureg); -void localiser_densmatr_krausMap(Qureg qureg, KrausMap map, ConstList64 ketTargs) { - - // Kraus map is simulated through its existing superoperator - localiser_densmatr_superoperator(qureg, map.superop, ketTargs); + // effect the superoperator as a dense matrix on the ket + bra qubits + bool conj = false; + bool transp = false; + auto braTargs = util_getBraQubits(ketTargs, qureg); + auto allTargs = util_getConcatenated(ketTargs, braTargs); + CompMatr matr = getSpoofedCompMatrFromSuperOp(op); + List64 empty = lists_getEmptyList64(); + localiser_statevec_anyCtrlAnyTargDenseMatr(qureg, empty, empty, allTargs, + matr, conj, transp); } +void localiser_densmatr_krausMap(Qureg qureg, KrausMap map, + ConstList64 ketTargs) { + // Kraus map is simulated through its existing superoperator + localiser_densmatr_superoperator(qureg, map.superop, ketTargs); +} /* * PARTIAL TRACE */ +auto getNonTracedQubitOrder(Qureg qureg, ConstList64 originalTargs, + ConstList64 revisedTargs) { -auto getNonTracedQubitOrder(Qureg qureg, ConstList64 originalTargs, ConstList64 revisedTargs) { + // get a list of all the qureg's qubits when treated as a statevector + auto allQubits = util_getRange(2 * qureg.numQubits); - // get a list of all the qureg's qubits when treated as a statevector - auto allQubits = util_getRange(2 * qureg.numQubits); - - // determine the ordering of all the Qureg's qubits after swaps - for (size_t i=0; i Z and X <-> Y in the prefix qubits - // have identical communication patterns, and so can all be processed after one round - // of amps exchange. - - /// @todo - /// We can optimise further in a similar spirit to above by grouping identically- - /// communicating strings into those which differ only by suffix I <-> Z and X <-> Y, - /// which have identical amplitude-mixing patterns, to avoid repeated enumeration of - /// all amplitudes and reduce the associated memory-movement / caching costs. - /// (This is facilitated "for free" by cuStateVec in GPU settings, although we do not - /// wish to here differentiate localiser logic based on CPU vs GPU deployment) - - qcomp totalValue = 0; - - using Key = PAULI_MASK_TYPE; - using Term = tuple; - std::unordered_map> groups; - - // group sum's terms into those with identical communication patterns - for (int i=0; i Z and X <-> Y in + // the prefix qubits have identical communication patterns, and so can all be + // processed after one round of amps exchange. + + /// @todo + /// We can optimise further in a similar spirit to above by grouping + /// identically- communicating strings into those which differ only by suffix + /// I <-> Z and X <-> Y, which have identical amplitude-mixing patterns, to + /// avoid repeated enumeration of all amplitudes and reduce the associated + /// memory-movement / caching costs. (This is facilitated "for free" by + /// cuStateVec in GPU settings, although we do not wish to here differentiate + /// localiser logic based on CPU vs GPU deployment) + + qcomp totalValue = 0; + + using Key = PAULI_MASK_TYPE; + using Term = tuple; + std::unordered_map> groups; + + // group sum's terms into those with identical communication patterns + for (int i = 0; i < sum.numTerms; i++) { + Term term = tuple{sum.strings[i], sum.coeffs[i]}; + Key totalKey = paulis_getKeyOfSameMixedAmpsGroup(sum.strings[i]); + Key prefixKey = getBitsLeftOfIndex( + totalKey, qureg.logNumAmpsPerNode - 1); // 0 if !qureg.isDistributed + + groups[prefixKey].push_back(term); + } + + // process each group in-turn + for (auto &[key, terms] : groups) { + + // perform communication once per-group, if necessary + int pairRank = flipBits(qureg.rank, key); + if (pairRank != qureg.rank) + comm_exchangeAmpsToBuffers(qureg, pairRank); + + // determine backend function to invoke upon all terms in group (likely + // _subB) + auto termFunc = (pairRank == qureg.rank) + ? getStateVecExpecAllSuffixPauliStr + : accel_statevec_calcExpecPauliStr_subB; + + // for each term within the current group... + for (auto &[str, coeff] : terms) { + auto [targsX, targsY, targsZ] = paulis_getSeparateInds(str); + auto [prefixX, suffixX] = util_getPrefixAndSuffixQubits(targsX, qureg); + auto [prefixY, suffixY] = util_getPrefixAndSuffixQubits(targsY, qureg); + auto [prefixZ, suffixZ] = util_getPrefixAndSuffixQubits(targsZ, qureg); + + // contribute coeff * prefix-coeff * suffix-sum + qcomp termFactor = paulis_getPrefixPaulisElem(qureg, prefixY, prefixZ); + qcomp termValue = termFactor * termFunc(qureg, suffixX, suffixY, suffixZ); + + /// @todo + /// use Kahan summation to improve (for free) the accuracy of totalValue + /// here! This sum is always serial, so we should always use it! It is + /// acceptable to grow a list and pass it to a utility function, since we + /// always assume the number of terms in the PauliStrSum is tractable! + totalValue += coeff * termValue; + + // prefixX wasn't used since it only informs pair-ranks which are not + // consulted here (due to being embarrassingly parallel); we suppress + // warning + (void)prefixX; } + } - // process each group in-turn - for (auto& [key, terms] : groups) { - - // perform communication once per-group, if necessary - int pairRank = flipBits(qureg.rank, key); - if (pairRank != qureg.rank) - comm_exchangeAmpsToBuffers(qureg, pairRank); - - // determine backend function to invoke upon all terms in group (likely _subB) - auto termFunc = (pairRank == qureg.rank)? - getStateVecExpecAllSuffixPauliStr : - accel_statevec_calcExpecPauliStr_subB; - - // for each term within the current group... - for (auto& [str, coeff] : terms) { - auto [targsX, targsY, targsZ] = paulis_getSeparateInds(str); - auto [prefixX, suffixX] = util_getPrefixAndSuffixQubits(targsX, qureg); - auto [prefixY, suffixY] = util_getPrefixAndSuffixQubits(targsY, qureg); - auto [prefixZ, suffixZ] = util_getPrefixAndSuffixQubits(targsZ, qureg); - - // contribute coeff * prefix-coeff * suffix-sum - qcomp termFactor = paulis_getPrefixPaulisElem(qureg, prefixY, prefixZ); - qcomp termValue = termFactor * termFunc(qureg, suffixX, suffixY, suffixZ); - - /// @todo - /// use Kahan summation to improve (for free) the accuracy of totalValue - /// here! This sum is always serial, so we should always use it! It is - /// acceptable to grow a list and pass it to a utility function, since we - /// always assume the number of terms in the PauliStrSum is tractable! - totalValue += coeff * termValue; - - // prefixX wasn't used since it only informs pair-ranks which are not - // consulted here (due to being embarrassingly parallel); we suppress warning - (void) prefixX; - } - } - - // combine contributions from each node - if (qureg.isDistributed) - comm_reduceAmp(&totalValue); + // combine contributions from each node + if (qureg.isDistributed) + comm_reduceAmp(&totalValue); - return totalValue; + return totalValue; } - qcomp localiser_densmatr_calcExpecPauliStrSum(Qureg qureg, PauliStrSum sum) { - assert_localiserGivenDensMatr(qureg); + assert_localiserGivenDensMatr(qureg); - // TOOD: - // we can optimise this method by grouping sum's terms into strings which - // involve differ only by I <-> Z and X <-> Y, which have identical - // enumeration patterns and which can significantly reduce superfluous - // re-enumeration of the amps, reducing memroy-movement/caching costs. - // Explore this! + // TOOD: + // we can optimise this method by grouping sum's terms into strings which + // involve differ only by I <-> Z and X <-> Y, which have identical + // enumeration patterns and which can significantly reduce superfluous + // re-enumeration of the amps, reducing memroy-movement/caching costs. + // Explore this! - // every term of 'sum' is embarrassingly parallel to evaluate; sum all this node's contributions - qcomp value = 0; - for (qindex t=0; t +void cpu_statevec_packFusedMultiSwapBuffers_sub(Qureg qureg, const std::map& swapMap, int target_m, qcomp* buffer) { + + List64 sortedSufTargs = lists_getEmptyList64(); + List64 targetStates = lists_getEmptyList64(); + int bitIndex = 0; + + for (auto const& [suf, pre] : swapMap) { + sortedSufTargs.push_back(suf); + targetStates.push_back((target_m >> bitIndex) & 1); + bitIndex++; + } + + SET_VAR_AT_COMPILE_TIME(int, k, NumQubits, swapMap.size()); + qindex numIts = qureg.numAmpsPerNode >> k; + qindex qubitStateMask = util_getBitMask(sortedSufTargs, targetStates); + + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* outBuffer = getCpuQcompPtr(buffer); + + #pragma omp parallel for schedule(static) if(qureg.isMultithreaded) + for (qindex n = 0; n < numIts; n++) { + qindex i = insertBitsWithMaskedValues(n, sortedSufTargs.data(), k, qubitStateMask); + outBuffer[n] = amps[i]; + } +} + +void cpu_statevec_packFusedMultiSwapBuffers(Qureg qureg, const std::map& swapMap, int target_m, qcomp* buffer) { + int k = swapMap.size(); + if (k == 1) cpu_statevec_packFusedMultiSwapBuffers_sub<1>(qureg, swapMap, target_m, buffer); + else if (k == 2) cpu_statevec_packFusedMultiSwapBuffers_sub<2>(qureg, swapMap, target_m, buffer); + else if (k == 3) cpu_statevec_packFusedMultiSwapBuffers_sub<3>(qureg, swapMap, target_m, buffer); + else if (k == 4) cpu_statevec_packFusedMultiSwapBuffers_sub<4>(qureg, swapMap, target_m, buffer); + else if (k == 5) cpu_statevec_packFusedMultiSwapBuffers_sub<5>(qureg, swapMap, target_m, buffer); + else cpu_statevec_packFusedMultiSwapBuffers_sub<-1>(qureg, swapMap, target_m, buffer); +} + +template +void cpu_statevec_unpackFusedMultiSwapBuffers_sub(Qureg qureg, const std::map& swapMap, int target_m, qcomp* buffer) { + + List64 sortedSufTargs = lists_getEmptyList64(); + List64 targetStates = lists_getEmptyList64(); + int bitIndex = 0; + + for (auto const& [suf, pre] : swapMap) { + sortedSufTargs.push_back(suf); + targetStates.push_back((target_m >> bitIndex) & 1); + bitIndex++; + } + + SET_VAR_AT_COMPILE_TIME(int, k, NumQubits, swapMap.size()); + qindex numIts = qureg.numAmpsPerNode >> k; + qindex qubitStateMask = util_getBitMask(sortedSufTargs, targetStates); + + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* inBuffer = getCpuQcompPtr(buffer); + + #pragma omp parallel for schedule(static) if(qureg.isMultithreaded) + for (qindex n = 0; n < numIts; n++) { + qindex i = insertBitsWithMaskedValues(n, sortedSufTargs.data(), k, qubitStateMask); + amps[i] = inBuffer[n]; + } +} + +void cpu_statevec_unpackFusedMultiSwapBuffers(Qureg qureg, const std::map& swapMap, int target_m, qcomp* buffer) { + int k = swapMap.size(); + if (k == 1) cpu_statevec_unpackFusedMultiSwapBuffers_sub<1>(qureg, swapMap, target_m, buffer); + else if (k == 2) cpu_statevec_unpackFusedMultiSwapBuffers_sub<2>(qureg, swapMap, target_m, buffer); + else if (k == 3) cpu_statevec_unpackFusedMultiSwapBuffers_sub<3>(qureg, swapMap, target_m, buffer); + else if (k == 4) cpu_statevec_unpackFusedMultiSwapBuffers_sub<4>(qureg, swapMap, target_m, buffer); + else if (k == 5) cpu_statevec_unpackFusedMultiSwapBuffers_sub<5>(qureg, swapMap, target_m, buffer); + else cpu_statevec_unpackFusedMultiSwapBuffers_sub<-1>(qureg, swapMap, target_m, buffer); +} + /* * SWAPS diff --git a/quest/src/cpu/cpu_subroutines.hpp b/quest/src/cpu/cpu_subroutines.hpp index 3dbae057b..1d663bccc 100644 --- a/quest/src/cpu/cpu_subroutines.hpp +++ b/quest/src/cpu/cpu_subroutines.hpp @@ -16,6 +16,7 @@ #include "quest/src/core/utilities.hpp" #include +#include using std::vector; @@ -48,6 +49,9 @@ template qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, Con qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2); +void cpu_statevec_packFusedMultiSwapBuffers(Qureg qureg, const std::map& swapMap, int target_m, qcomp* buffer); + +void cpu_statevec_unpackFusedMultiSwapBuffers(Qureg qureg, const std::map& swapMap, int target_m, qcomp* buffer); /* * SWAPS