diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 000000000..6c1b52726 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,70 @@ +# Benchmark driver for issue #749 — small GPU allocations + +> **Note for maintainers:** this directory is a throw-away benchmarking aid shared +> at the reviewer's request (PR #783). It is **not** wired into CMake/CI and can be +> deleted before merge — the squash will keep it out of the permanent history. + +`benchmark_749.cpp` times the two multi-qubit GPU operations targeted by #749, which +previously copied a qubit-index list to the device (`cudaMalloc` + `cudaMemcpyAsync` ++ `cudaFree`) on *every* call: + +- `applyMultiQubitProjector` → `*_multiQubitProjector_sub` +- `calcProbOfMultiQubitOutcome` → `*_calcProbOfMultiQubitOutcome_sub` + +It forces the single-GPU path (`useGpuAccel=1`, distribution/multithreading off), +calls `syncQuESTEnv()` around each timed region (so we measure *completed* GPU work), +and prints per-call wall time (µs) as CSV. + +## Build & run + +It compiles against QuEST through the built-in `USER_SOURCE_NAMES` mechanism — no +extra include/link flags needed: + +```bash +# from a QuEST checkout (this branch): +cmake -S . -B build_bench \ + -D QUEST_ENABLE_CUDA=ON \ + -D CMAKE_CUDA_ARCHITECTURES=120 \ + -D CMAKE_BUILD_TYPE=Release \ + -D USER_SOURCE_NAMES=benchmarks/benchmark_749.cpp \ + -D USER_OUTPUT_EXE_NAME=bench_749 +cmake --build build_bench --target bench_749 -j + +# args: [minQubits=4] [maxQubits=20] [numTargs=3] [reps=2000] +./build_bench/bench_749 4 20 3 2000 +``` + +To get the before/after numbers, build the same file once against a clean +`origin/devel` checkout (baseline) and once against this branch (optimised). + +For the CUDA-API counts, wrap a shorter run under Nsight Systems and tally the +runtime API calls, e.g.: + +```bash +nsys profile --trace=cuda -o trace_749 ./build_bench/bench_749 8 12 3 200 +nsys stats --report cuda_api_sum trace_749.nsys-rep # cudaMalloc/Free/MemcpyAsync/LaunchKernel +``` + +## Results on the author's machine (RTX PRO 6000 Blackwell, CUDA 13.0, sm_120) + +CUDA runtime API call counts (N = 8…12, 200 reps, both ops): + +| CUDA runtime call | baseline | optimised | change | +|---|--:|--:|--:| +| `cudaMalloc` | 3020 | 1010 | −66% | +| `cudaFree` | 3021 | 1011 | −66% | +| `cudaMemcpyAsync` | 3015 | 1005 | −67% | +| `cudaLaunchKernel`| 2025 | 2025 | unchanged | + +Per-call wall time (µs), `numTargs = 3`, 2000 reps: + +| N | projector base | projector opt | speedup | prob base | prob opt | speedup | +|--:|--:|--:|--:|--:|--:|--:| +| 4 | 12.41 | 6.48 | 1.92× | 20.35 | 14.59 | 1.39× | +| 8 | 11.74 | 6.40 | 1.83× | 19.44 | 14.18 | 1.37× | +| 12 | 11.85 | 6.52 | 1.82× | 19.79 | 14.44 | 1.37× | +| 16 | 12.22 | 6.84 | 1.79× | 28.63 | 23.21 | 1.23× | +| 20 | 58.41 | 13.74 | 4.25× | 67.99 | 61.99 | 1.10× | + +The residual ~1000 `cudaMalloc` in the optimised build is `thrust::reduce`'s own +internal temporary inside `calcProb` (inherent to Thrust, out of scope). diff --git a/benchmarks/benchmark_749.cpp b/benchmarks/benchmark_749.cpp new file mode 100644 index 000000000..b7c6a3ccc --- /dev/null +++ b/benchmarks/benchmark_749.cpp @@ -0,0 +1,101 @@ +/* benchmark_749.cpp + * + * Micro-benchmark for QuEST issue #749 ("Profile and optimise-away small GPU + * allocations"). It times the two families of multi-qubit GPU operations that + * previously copied a qubit-index list to the device on every call: + * + * - applyMultiQubitProjector() -> thrust_*_multiQubitProjector_sub + * - calcProbOfMultiQubitOutcome() -> thrust_*_calcProbOfMultiQubitOutcome_sub + * + * For a sweep of (small) Qureg sizes it reports the mean wall-clock time per + * call (in microseconds) as CSV on stdout. Build it once against the baseline + * (unmodified origin/devel) and once against the optimised branch; the + * accompanying analyze.py compares the two CSVs and plots the speedup. + * + * The GPU is forced on (useGpuAccel=1) and distribution/multithreading are off, + * so we isolate the single-GPU code path. syncQuESTEnv() is called before each + * timed region so we measure completed GPU work, not just async dispatch. + * + * Usage: ./bench_749 [minQubits] [maxQubits] [numTargs] [reps] + * defaults: minQubits=4 maxQubits=20 numTargs=3 reps=2000 + */ + +#include "quest.h" + +#include +#include +#include +#include + +using clk = std::chrono::high_resolution_clock; + +static double timeProjector(Qureg qureg, const std::vector& qubits, + const std::vector& outcomes, int reps) { + + // collapse once so subsequent identical projections are well-defined (prob=1) + initPlusState(qureg); + applyMultiQubitProjector(qureg, qubits, outcomes); + syncQuESTEnv(); + + auto t0 = clk::now(); + for (int r = 0; r < reps; r++) + applyMultiQubitProjector(qureg, qubits, outcomes); + syncQuESTEnv(); + auto t1 = clk::now(); + + double ns = std::chrono::duration(t1 - t0).count(); + return ns / reps / 1e3; // microseconds per call +} + +static double timeProbOfOutcome(Qureg qureg, const std::vector& qubits, + const std::vector& outcomes, int reps) { + + initPlusState(qureg); + volatile qreal sink = 0; + sink += calcProbOfMultiQubitOutcome(qureg, qubits, outcomes); + syncQuESTEnv(); + + auto t0 = clk::now(); + for (int r = 0; r < reps; r++) + sink += calcProbOfMultiQubitOutcome(qureg, qubits, outcomes); + syncQuESTEnv(); + auto t1 = clk::now(); + + double ns = std::chrono::duration(t1 - t0).count(); + (void) sink; + return ns / reps / 1e3; // microseconds per call +} + +int main(int argc, char** argv) { + + int minQubits = (argc > 1) ? atoi(argv[1]) : 4; + int maxQubits = (argc > 2) ? atoi(argv[2]) : 20; + int numTargs = (argc > 3) ? atoi(argv[3]) : 3; + int reps = (argc > 4) ? atoi(argv[4]) : 2000; + + initQuESTEnv(); + + // CSV header (printed once; '#' lines are ignored by analyze.py) + printf("numQubits,numTargs,proj_us,prob_us\n"); + + for (int n = minQubits; n <= maxQubits; n++) { + + // statevector forced onto the GPU only + Qureg qureg = createCustomQureg(n, 0, /*distrib*/0, /*gpu*/1, /*mt*/0); + + std::vector qubits(numTargs), outcomes(numTargs, 0); + for (int i = 0; i < numTargs; i++) + qubits[i] = i; + + double proj = timeProjector(qureg, qubits, outcomes, reps); + double prob = timeProbOfOutcome(qureg, qubits, outcomes, reps); + + printf("%d,%d,%.4f,%.4f\n", n, numTargs, proj, prob); + fflush(stdout); + + destroyQureg(qureg); + } + + finalizeQuESTEnv(); + return 0; +} diff --git a/quest/src/core/accelerator.cpp b/quest/src/core/accelerator.cpp index 677e6c74a..89c0a920f 100644 --- a/quest/src/core/accelerator.cpp +++ b/quest/src/core/accelerator.cpp @@ -1087,13 +1087,17 @@ qcomp accel_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMa void accel_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) { - GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_multiQubitProjector_sub, qureg, qubits.size() ); - func(qureg, qubits, outcomes, prob); + // the projector uses a bitmask reformulation (see issue #749) which needs no + // numTargs-templated dispatch, so we branch directly between the CPU and GPU backends + (qureg.isGpuAccelerated)? + gpu_statevec_multiQubitProjector_sub(qureg, qubits, outcomes, prob) : + cpu_statevec_multiQubitProjector_sub(qureg, qubits, outcomes, prob); } void accel_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) { - GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, densmatr_multiQubitProjector_sub, qureg, qubits.size() ); - func(qureg, qubits, outcomes, prob); + (qureg.isGpuAccelerated)? + gpu_densmatr_multiQubitProjector_sub(qureg, qubits, outcomes, prob) : + cpu_densmatr_multiQubitProjector_sub(qureg, qubits, outcomes, prob); } diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp index 59df946e9..7cedcd930 100644 --- a/quest/src/cpu/cpu_subroutines.cpp +++ b/quest/src/cpu/cpu_subroutines.cpp @@ -2562,11 +2562,9 @@ template qcomp cpu_densmatr_calcExpecFullStateDiagMatr_sub(Qureg, F */ -template void cpu_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) { // all qubits are in suffix - assert_numTargsMatchesTemplateParam(qubits.size(), NumQubits); // use cpu_qcomp arithmetic overloads (avoid qcomp's) cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); @@ -2574,26 +2572,21 @@ void cpu_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, Const // visit every amp, setting to zero or multiplying it by renorm qindex numIts = qureg.numAmpsPerNode; - // binary value of targeted qubits in basis states which are to be retained - qindex retainValue = getIntegerFromBits(outcomes.data(), outcomes.size()); + // use primitive bitmasks instead of a per-qubit loop (see issue #749); the test + // getValueOfBits(n,qubits)==retainValue is exactly (n & qubitMask) == valueMask + qindex qubitMask = util_getBitMask(qubits); + qindex valueMask = util_getBitMask(qubits, outcomes); qreal renorm = 1 / std::sqrt(prob); - // use template param to compile-time unroll loop in getValueOfBits() - SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.size()); - #pragma omp parallel for if(qureg.isMultithreaded) for (qindex n=0; n void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) { // this function is merely an optimisation to avoid calling the above @@ -2601,7 +2594,6 @@ void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, Const // pre- and post-multiply projector versions DO just call above. // qubits are unconstrained, and can include prefix qubits - assert_numTargsMatchesTemplateParam(qubits.size(), NumQubits); // use cpu_qcomp arithmetic overloads (avoid qcomp's) cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); @@ -2609,13 +2601,12 @@ void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, Const // visit every amp, setting most to zero and multiplying the remainder by renorm qindex numIts = qureg.numAmpsPerNode; - // binary value of targeted qubits in basis states which are to be retained - qindex retainValue = getIntegerFromBits(outcomes.data(), outcomes.size()); + // use primitive bitmasks instead of a per-qubit loop (see issue #749); the original + // test (v1==v2)&&(retainValue==v1) is exactly (r & qubitMask)==valueMask && (c & qubitMask)==valueMask + qindex qubitMask = util_getBitMask(qubits); + qindex valueMask = util_getBitMask(qubits, outcomes); qreal renorm = 1 / prob; - // use template param to compile-time unroll loops in getValueOfBits() - SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.size()); - #pragma omp parallel for if(qureg.isMultithreaded) for (qindex n=0; n qcomp cpu_densmatr_calcExpecFullStateD * PROJECTORS */ -template void cpu_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob); -template void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob); +void cpu_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob); +void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob); /* diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp index 9b8e819b5..296fa6704 100644 --- a/quest/src/gpu/gpu_subroutines.cpp +++ b/quest/src/gpu/gpu_subroutines.cpp @@ -1861,21 +1861,19 @@ template qcomp gpu_densmatr_calcExpecFullStateDiagMatr_sub(Qureg, F */ -template void gpu_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) { - // all qubits are in suffix - assert_numTargsMatchesTemplateParam(qubits.size(), NumQubits); + // all qubits are in suffix; the bitmask reformulation (see issue #749) needs no + // per-numTargs template specialisation, so this function is no longer templated #if QUEST_COMPILE_CUQUANTUM - // cuQuantum disregards NumQubits template param cuquantum_statevec_multiQubitProjector_sub(qureg, qubits, outcomes, prob); #elif QUEST_COMPILE_CUDA qreal renorm = 1 / std::sqrt(prob); - thrust_statevec_multiQubitProjector_sub(qureg, qubits, outcomes, renorm); + thrust_statevec_multiQubitProjector_sub(qureg, qubits, outcomes, renorm); #else error_gpuSimButGpuNotCompiled(); @@ -1883,16 +1881,14 @@ void gpu_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, Const } -template void gpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) { // qubits are unconstrained, and can include prefix qubits - assert_numTargsMatchesTemplateParam(qubits.size(), NumQubits); #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM qreal renorm = 1 / prob; - thrust_densmatr_multiQubitProjector_sub(qureg, qubits, outcomes, renorm); + thrust_densmatr_multiQubitProjector_sub(qureg, qubits, outcomes, renorm); #else error_gpuSimButGpuNotCompiled(); @@ -1900,10 +1896,6 @@ void gpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, Const } -INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, gpu_statevec_multiQubitProjector_sub, (Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) ) -INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, gpu_densmatr_multiQubitProjector_sub, (Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) ) - - /* * STATE INITIALISATION diff --git a/quest/src/gpu/gpu_subroutines.hpp b/quest/src/gpu/gpu_subroutines.hpp index 029e0e871..9c6a3013f 100644 --- a/quest/src/gpu/gpu_subroutines.hpp +++ b/quest/src/gpu/gpu_subroutines.hpp @@ -183,8 +183,8 @@ template qcomp gpu_densmatr_calcExpecFullStateD * PROJECTORS */ -template void gpu_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob); -template void gpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob); +void gpu_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob); +void gpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob); /* diff --git a/quest/src/gpu/gpu_thrust.cuh b/quest/src/gpu/gpu_thrust.cuh index 864cca5f8..2a948750f 100644 --- a/quest/src/gpu/gpu_thrust.cuh +++ b/quest/src/gpu/gpu_thrust.cuh @@ -448,12 +448,18 @@ struct functor_insertBits { // is used to enumerate specific basis-state indices // with qubits in the specified bit values - int* sortedIndsPtr; + // we store the sorted qubit indices by-value inside a trivially-copyable + // List64 (a fixed-size CUDA-compatible array), rather than as a pointer to + // a separately-allocated thrust::device_vector. This eliminates the per-call + // cudaMalloc + cudaMemcpy of the qubit list (see issue #749); the list is + // instead passed directly as a kernel argument and loaded from constant/param + // memory, which is negligibly cheap even for small Quregs. + List64 sortedInds; qindex valueMask; int numBits; - functor_insertBits(int* ptr, qindex mask, int nBits) : - sortedIndsPtr(ptr), valueMask(mask), numBits(nBits) + functor_insertBits(List64 sorted, qindex mask, int nBits) : + sortedInds(sorted), valueMask(mask), numBits(nBits) { assert_numTargsMatchesTemplateParam(nBits, NumBits); } @@ -464,7 +470,7 @@ struct functor_insertBits { SET_VAR_AT_COMPILE_TIME(int, nbits, NumBits, numBits); // return ith local index where bits have the specified values at the specified indices - return insertBitsWithMaskedValues(i, sortedIndsPtr, nbits, valueMask); + return insertBitsWithMaskedValues(i, sortedInds.data(), nbits, valueMask); } }; @@ -508,72 +514,65 @@ struct functor_getFidelityTerm { }; -template struct functor_projectStateVec { - // this functor multiplies an amp with zero or a + // this functor multiplies an amp with zero or a // renormalisation codfficient, depending on whether // the basis state of the amp has qubits in a particular // configuration. This is used to project statevector // qubits into a particular measurement outcome - int* targetsPtr; - int numTargets, rank; - qindex retainValue; + // rather than copying the target-qubit list to the device (incurring a + // cudaMalloc + cudaMemcpy per call; see issue #749), we pass two primitive + // bitmasks which are loaded directly into device registers. The test + // getValueOfBits(n,targs)==retainValue is exactly equivalent to + // (n & qubitMask) == valueMask, where qubitMask flags the target bit + // positions and valueMask holds the desired outcomes at those positions. + qindex qubitMask, valueMask; qreal renorm; functor_projectStateVec( - int* targetsPtr, int numTargets, - qindex retainValue, qreal renorm + qindex qubitMask, qindex valueMask, qreal renorm ) : - targetsPtr(targetsPtr), numTargets(numTargets), - retainValue(retainValue), renorm(renorm) - { - assert_numTargsMatchesTemplateParam(numTargets, NumTargets); - } + qubitMask(qubitMask), valueMask(valueMask), renorm(renorm) + { } __host__ __device__ gpu_qcomp operator()(qindex n, gpu_qcomp amp) { - // use the compile-time value if possible, to auto-unroll the getValueOfBits() loop below - SET_VAR_AT_COMPILE_TIME(int, numBits, NumTargets, numTargets); - // return amp scaled by zero or renorm, depending on whether n has projected substate - qindex val = getValueOfBits(n, targetsPtr, numBits); - qreal fac = renorm * (val == retainValue); + qreal fac = renorm * ((n & qubitMask) == valueMask); return fac * amp; } }; -template struct functor_projectDensMatr { - // this functor multiplies an amp with zero or a + // this functor multiplies an amp with zero or a // renormalisation coefficient, depending on whether // the basis state of the amp has qubits in a particular // configuration. This is used to project density matrix // qubits into a particular measurement outcome - int* targetsPtr; - int numTargets, rank, numQuregQubits; - qindex logNumAmpsPerNode, retainValue; + // as with functor_projectStateVec, we avoid copying the target-qubit list to + // the device (see issue #749) by passing primitive bitmasks. The original test + // (v1==v2) && (retainValue==v1) keeps an amp iff both the row and column + // substates equal the requested outcome, which is exactly + // (r & qubitMask)==valueMask && (c & qubitMask)==valueMask. + int rank, numQuregQubits; + qindex logNumAmpsPerNode, qubitMask, valueMask; qreal renorm; functor_projectDensMatr( - int* targetsPtr, int numTargets, int rank, int numQuregQubits, - qindex logNumAmpsPerNode, qindex retainValue, qreal renorm + qindex qubitMask, qindex valueMask, int rank, int numQuregQubits, + qindex logNumAmpsPerNode, qreal renorm ) : - targetsPtr(targetsPtr), numTargets(numTargets), rank(rank), numQuregQubits(numQuregQubits), - logNumAmpsPerNode(logNumAmpsPerNode), retainValue(retainValue), renorm(renorm) - { - assert_numTargsMatchesTemplateParam(numTargets, NumTargets); - } + rank(rank), numQuregQubits(numQuregQubits), logNumAmpsPerNode(logNumAmpsPerNode), + qubitMask(qubitMask), valueMask(valueMask), renorm(renorm) + { } __host__ __device__ gpu_qcomp operator()(qindex n, gpu_qcomp amp) { - // use the compile-time value if possible, to auto-unroll the getValueOfBits() loop below - SET_VAR_AT_COMPILE_TIME(int, numBits, NumTargets, numTargets); - // i = global index of nth local amp qindex i = concatenateBits(rank, n, logNumAmpsPerNode); @@ -581,11 +580,8 @@ struct functor_projectDensMatr { qindex r = getBitsRightOfIndex(i, numQuregQubits); qindex c = getBitsLeftOfIndex(i, numQuregQubits-1); - qindex v1 = getValueOfBits(r, targetsPtr, numBits); - qindex v2 = getValueOfBits(c, targetsPtr, numBits); - - // multiply amp with renorm or zero if values disagree with given outcomes - qreal fac = renorm * (v1 == v2) * (retainValue == v1); + // multiply amp with renorm or zero if either row/col substate disagrees with outcomes + qreal fac = renorm * ((r & qubitMask) == valueMask) * ((c & qubitMask) == valueMask); return fac * amp; } }; @@ -792,10 +788,11 @@ qreal thrust_densmatr_calcTotalProb_sub(Qureg qureg) { template qreal thrust_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes) { - devints sortedQubits = getDevInts(util_getSorted(qubits)); + // pass the sorted qubit list by-value into the functor (no device alloc; see issue #749) + List64 sortedQubits = util_getSorted(qubits); qindex valueMask = util_getBitMask(qubits, outcomes); - auto indFunctor = functor_insertBits(getPtr(sortedQubits), valueMask, qubits.size()); + auto indFunctor = functor_insertBits(sortedQubits, valueMask, qubits.size()); auto probFunctor = functor_getAmpNorm(); auto rawIter = thrust::make_counting_iterator(QINDEX_ZERO); @@ -812,12 +809,11 @@ qreal thrust_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, ConstList64 q template qreal thrust_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes) { - // cannot move these into functor_insertBits constructor, since the memory - // would dangle - and we cannot bind deviceints as an attribute - it's host-only! - devints sortedQubits = getDevInts(util_getSorted(qubits)); + // pass the sorted qubit list by-value into the functor (no device alloc; see issue #749) + List64 sortedQubits = util_getSorted(qubits); qindex valueMask = util_getBitMask(qubits, outcomes); - auto basisIndFunctor = functor_insertBits(getPtr(sortedQubits), valueMask, qubits.size()); + auto basisIndFunctor = functor_insertBits(sortedQubits, valueMask, qubits.size()); auto diagIndFunctor = functor_getDiagInd(qureg); auto probFunctor = functor_getAmpReal(); @@ -1015,13 +1011,13 @@ gpu_qcomp thrust_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateD */ -template void thrust_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal renorm) { - devints devQubits = getDevInts(qubits); - qindex retainValue = getIntegerFromBits(outcomes.data(), outcomes.size()); - auto projFunctor = functor_projectStateVec( - getPtr(devQubits), qubits.size(), retainValue, renorm); + // pass primitive bitmasks instead of copying the qubit list to device (see issue #749) + qindex qubitMask = util_getBitMask(qubits); + qindex valueMask = util_getBitMask(qubits, outcomes); + auto projFunctor = functor_projectStateVec( + qubitMask, valueMask, renorm); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); auto ampIter = getStartPtr(qureg); @@ -1031,14 +1027,14 @@ void thrust_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, Co } -template void thrust_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal renorm) { - devints devQubits = getDevInts(qubits); - qindex retainValue = getIntegerFromBits(outcomes.data(), outcomes.size()); - auto projFunctor = functor_projectDensMatr( - getPtr(devQubits), qubits.size(), qureg.rank, qureg.numQubits, - qureg.logNumAmpsPerNode, retainValue, renorm); + // pass primitive bitmasks instead of copying the qubit list to device (see issue #749) + qindex qubitMask = util_getBitMask(qubits); + qindex valueMask = util_getBitMask(qubits, outcomes); + auto projFunctor = functor_projectDensMatr( + qubitMask, valueMask, qureg.rank, qureg.numQubits, + qureg.logNumAmpsPerNode, renorm); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); auto ampIter = getStartPtr(qureg);