Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion quest/src/core/accelerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -232,12 +232,27 @@ qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstL

// note qubits may incidentally be ctrls or targs; it doesn't matter
GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_packAmpsIntoBuffer, qureg, qubits.size() );

// return the number of packed amps, for caller convenience
return func(qureg, qubits, qubitStates);
}


void accel_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates) {

    // Counterpart of accel_statevec_packAmpsIntoBuffer(): the received sub-buffer is
    // scattered into the strided local amps for which the given qubits are in the
    // given states. It exists to serve the fused multi-SWAP routine.
    //
    // Only a CPU implementation is dispatched here. The fused routine confines itself
    // to non-GPU quregs (issue #595 notes the OpenMP logic alone is sufficient), hence
    // no GPU kernel is provided and the GPU build is left untouched.

    // an empty list of qubit states is reported as an error before dispatching
    if (qubitStates.empty()) {
        error_noCtrlsGivenToBufferPacker();
    }

    // select the CPU variant keyed on the number of given qubits, then invoke it
    GET_FUNC_OPTIMISED_FOR_ONE_PARAM( func, cpu_statevec_unpackAmpsFromBuffer, qubits.size() );
    func(qureg, qubits, qubitStates);
}


qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2) {

return (qureg.isGpuAccelerated)?
Expand Down
2 changes: 2 additions & 0 deletions quest/src/core/accelerator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,8 @@ void accel_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, PauliS

// packs the local amps, where the given qubits are in the given states, into the
// communication buffer, and returns the number of packed amps
qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates);

// inverse of the above packing; scatters the received sub-buffer back into the local
// amps where the given qubits are in the given states (only a CPU path is dispatched)
void accel_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates);

qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2);


Expand Down
73 changes: 62 additions & 11 deletions quest/src/core/localiser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -900,24 +900,75 @@ void anyCtrlMultiSwapBetweenPrefixAndSuffix(Qureg qureg, ConstList64 ctrls, Cons
// the SWAPs act on unique qubit pairs and so commute.

/// @todo
/// - the sequence of pair-wise full-swaps should be more efficient as a
/// "single" sequence of smaller messages sending amps directly to their
/// final destination node. This could use a new "multiSwap" function.
/// - if the user has compiled cuQuantum, and Qureg is GPU-accelerated, the
/// multiSwap function should use custatevecSwapIndexBits() if local,
/// or custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed,
/// - if the user has compiled cuQuantum, and Qureg is GPU-accelerated, this
/// routine could use custatevecSwapIndexBits() if local, or
/// custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed,
/// although the latter requires substantially more work like setting up
/// a communicator which may be inelegant alongside our own distribution scheme.

// perform necessary swaps to move all targets into suffix, each of which invokes communication
// collect the non-trivial pairs; each swaps a suffix qubit with a prefix qubit
auto suffixTargs = lists_getEmptyList64();
auto prefixTargs = lists_getEmptyList64();
for (size_t i=0; i<targsA.size(); i++) {

if (targsA[i] == targsB[i])
continue;
suffixTargs.push_back(std::min(targsA[i], targsB[i]));
prefixTargs.push_back(std::max(targsA[i], targsB[i]));
}
int numSwaps = suffixTargs.size();
if (numSwaps == 0)
return;

// the fused routine below targets the uncontrolled, non-GPU case which every internal
// caller currently uses. A controlled multi-SWAP, or a GPU-accelerated Qureg, falls back
// to the per-swap routine (issue #595 notes the OpenMP logic alone is sufficient, so the
// GPU path is left unchanged)
if (!ctrls.empty() || qureg.isGpuAccelerated) {
for (int i=0; i<numSwaps; i++)
anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, suffixTargs[i], prefixTargs[i]);
return;
}

// FUSED multi-SWAP: rather than performing each prefix<->suffix SWAP in turn (which
// wastefully relays an amplitude through intermediate nodes before its final node),
// we send each amplitude directly to its destination node in a single pass. The
// numSwaps disjoint SWAPs compose into one permutation of qubit bits, so an amplitude
// of this node moves to the rank obtained by overwriting each prefix-target rank-bit
// with the value of its partnered suffix-target bit. We enumerate the (up to)
// 2^numSwaps - 1 destination nodes (one per non-empty subset of prefix targets whose
// partnered suffix bit disagrees with this node's rank bit) and, for each, pack +
// exchange + unpack only the amplitudes bound there. The move is an involution
// between paired nodes, so the packed and unpacked amplitudes occupy the same local
// slots. See arXiv:quant-ph/0608239 (SWAP fusion) and arXiv:2311.01512 Sec IV.

std::vector<int> prefBits(numSwaps);
std::vector<int> rankBits(numSwaps);
for (int i=0; i<numSwaps; i++) {
prefBits[i] = util_getPrefixInd(prefixTargs[i], qureg);
rankBits[i] = getBit(qureg.rank, prefBits[i]);
}

// subset 0 are the amplitudes that do not move (all suffix bits already match the
// rank bits), so we skip it and iterate only the communicating subsets
qindex numSubsets = powerOf2(numSwaps);
for (qindex sub=1; sub<numSubsets; sub++) {

// the destination node flips this node's rank bits for the targeted subset, and
// the to-be-sent amplitudes are those whose suffix-target bits match the pattern
auto states = lists_getEmptyList64();
int pairRank = qureg.rank;
for (int i=0; i<numSwaps; i++) {
int inSubset = getBit(sub, i);
states.push_back(inSubset ? !rankBits[i] : rankBits[i]);
if (inSubset)
pairRank = static_cast<int>(flipBit(pairRank, prefBits[i]));
}

int suffixTarg = std::min(targsA[i], targsB[i]);
int prefixTarg = std::max(targsA[i], targsB[i]);
anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, suffixTarg, prefixTarg);
// pack the amplitudes bound for pairRank, exchange, and scatter the received
// amplitudes back into those same local slots
qindex numPacked = accel_statevec_packAmpsIntoBuffer(qureg, suffixTargs, states);
comm_exchangeSubBuffers(qureg, numPacked, pairRank);
accel_statevec_unpackAmpsFromBuffer(qureg, suffixTargs, states);
}
}

Expand Down
43 changes: 42 additions & 1 deletion quest/src/cpu/cpu_subroutines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -284,8 +284,49 @@ qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu
INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qindex, cpu_statevec_packAmpsIntoBuffer, (Qureg, ConstList64, ConstList64) )


// Scatters the amplitudes received in the communication buffer into the local
// amplitudes for which the qubits in qubitInds are in the states in qubitStates.
//  - NumQubits:   compile-time number of constrained qubits, used to unroll the
//                 bit-insertion loop; qubitInds.size() is used when it is not
//                 known at compile time
//  - qureg:       register whose cpuAmps are overwritten and whose cpuCommBuffer is read
//  - qubitInds:   indices of the constrained qubits (need not be pre-sorted)
//  - qubitStates: the state of each constrained qubit, in the order of qubitInds
template <int NumQubits>
void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates) {

    assert_numQubitsMatchesQubitStatesAndTemplateParam(qubitInds.size(), qubitStates.size(), NumQubits);

    // this is the inverse of cpu_statevec_packAmpsIntoBuffer; it scatters the received
    // contiguous sub-buffer back into the strided local amplitudes where the given qubits
    // are in the given states. It generalises anyCtrlSwap_subC to multiple constrained
    // qubits, as needed by the fused multi-SWAP routine.

    // use cpu_qcomp (in lieu of qcomp) even though no arithmetic happens below - just for consistency!
    cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps);
    cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer);

    // NOTE: a stray, unterminated '/*' previously sat here; because block comments do
    // not nest, it silently commented out the remainder of this function (and the
    // template instantiation that follows it) up to the next '*/' in the file

    // each constrained qubit halves the number of received amps
    qindex numIts = qureg.numAmpsPerNode / powerOf2(qubitInds.size());

    // received amplitudes begin at the buffer's receive offset
    qindex offset = getBufferRecvInd();

    // insertBitsWithMaskedValues() is given the qubit indices in sorted order, along
    // with a mask encoding the requested state of every constrained qubit
    auto sortedQubitInds = util_getSorted(qubitInds);
    auto qubitStateMask = util_getBitMask(qubitInds, qubitStates);

    // use template param to compile-time unroll loop in insertBits()
    SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubitInds.size());

    #pragma omp parallel for if(qureg.isMultithreaded)
    for (qindex n=0; n<numIts; n++) {

        // i = nth local index where qubits are in the specified states
        qindex i = insertBitsWithMaskedValues(n, sortedQubitInds.data(), numBits, qubitStateMask);

        // scatter the contiguous sub-buffer among the strided local amplitudes
        amps[i] = buffer[offset + n];
    }
}


INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_statevec_unpackAmpsFromBuffer, (Qureg, ConstList64, ConstList64) )



/*
* SWAPS
*/

Expand Down
2 changes: 2 additions & 0 deletions quest/src/cpu/cpu_subroutines.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ void cpu_fullstatediagmatr_setElemsFromMultiVarFunc(FullStateDiagMatr out, qcomp

// packs the local amps where the given qubits are in the given states into the
// communication buffer; NumQubits is the compile-time number of given qubits
template <int NumQubits> qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates);

// inverse of the above packing; scatters the received contiguous sub-buffer back into
// the strided local amps where the given qubits are in the given states
template <int NumQubits> void cpu_statevec_unpackAmpsFromBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates);

qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2);


Expand Down
Loading