QuEST-Kit · thedaemon-wizard · Jun 7, 2026 · Jun 9, 2026
diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -0,0 +1,70 @@
+# Benchmark driver for issue #749 — small GPU allocations
+
+> **Note for maintainers:** this directory is a throw-away benchmarking aid shared
+> at the reviewer's request (PR #783). It is **not** wired into CMake/CI and can be
+> deleted before merge — the squash will keep it out of the permanent history.
+
+`benchmark_749.cpp` times the two multi-qubit GPU operations targeted by #749, which
+previously copied a qubit-index list to the device (`cudaMalloc` + `cudaMemcpyAsync`
++ `cudaFree`) on *every* call:
+
+- `applyMultiQubitProjector`     → `*_multiQubitProjector_sub`
+- `calcProbOfMultiQubitOutcome`  → `*_calcProbOfMultiQubitOutcome_sub`
+
+It forces the single-GPU path (`useGpuAccel=1`, distribution/multithreading off),
+calls `syncQuESTEnv()` around each timed region (so we measure *completed* GPU work),
+and prints per-call wall time (µs) as CSV.
+
+## Build & run
+
+It compiles against QuEST through the built-in `USER_SOURCE_NAMES` mechanism — no
+extra include/link flags needed:
+
+```bash
+# from a QuEST checkout (this branch):
+cmake -S . -B build_bench \
+    -D QUEST_ENABLE_CUDA=ON \
+    -D CMAKE_CUDA_ARCHITECTURES=120 \
+    -D CMAKE_BUILD_TYPE=Release \
+    -D USER_SOURCE_NAMES=benchmarks/benchmark_749.cpp \
+    -D USER_OUTPUT_EXE_NAME=bench_749
+cmake --build build_bench --target bench_749 -j
+
+# args: [minQubits=4] [maxQubits=20] [numTargs=3] [reps=2000]
+./build_bench/bench_749 4 20 3 2000
+```
+
+To get the before/after numbers, build the same file once against a clean
+`origin/devel` checkout (baseline) and once against this branch (optimised).
+
+For the CUDA-API counts, wrap a shorter run under Nsight Systems and tally the
+runtime API calls, e.g.:
+
+```bash
+nsys profile --trace=cuda -o trace_749 ./build_bench/bench_749 8 12 3 200
+nsys stats --report cuda_api_sum trace_749.nsys-rep   # cudaMalloc/Free/MemcpyAsync/LaunchKernel
+```
+
+## Results on the author's machine (RTX PRO 6000 Blackwell, CUDA 13.0, sm_120)
+
+CUDA runtime API call counts (N = 8…12, 200 reps, both ops):
+
+| CUDA runtime call | baseline | optimised | change |
+|---|--:|--:|--:|
+| `cudaMalloc`      | 3020 | 1010 | −66% |
+| `cudaFree`        | 3021 | 1011 | −66% |
+| `cudaMemcpyAsync` | 3015 | 1005 | −67% |
+| `cudaLaunchKernel`| 2025 | 2025 | unchanged |
+
+Per-call wall time (µs), `numTargs = 3`, 2000 reps:
+
+| N | projector base | projector opt | speedup | prob base | prob opt | speedup |
+|--:|--:|--:|--:|--:|--:|--:|
+| 4  | 12.41 | 6.48 | 1.92× | 20.35 | 14.59 | 1.39× |
+| 8  | 11.74 | 6.40 | 1.83× | 19.44 | 14.18 | 1.37× |
+| 12 | 11.85 | 6.52 | 1.82× | 19.79 | 14.44 | 1.37× |
+| 16 | 12.22 | 6.84 | 1.79× | 28.63 | 23.21 | 1.23× |
+| 20 | 58.41 | 13.74 | 4.25× | 67.99 | 61.99 | 1.10× |
+
+The residual ~1000 `cudaMalloc` in the optimised build is `thrust::reduce`'s own
+internal temporary inside `calcProb` (inherent to Thrust, out of scope).
diff --git a/benchmarks/benchmark_749.cpp b/benchmarks/benchmark_749.cpp
@@ -0,0 +1,101 @@
+/* benchmark_749.cpp
+ *
+ * Micro-benchmark for QuEST issue #749 ("Profile and optimise-away small GPU
+ * allocations"). It times the two families of multi-qubit GPU operations that
+ * previously copied a qubit-index list to the device on every call:
+ *
+ *   - applyMultiQubitProjector()      -> thrust_*_multiQubitProjector_sub
+ *   - calcProbOfMultiQubitOutcome()   -> thrust_*_calcProbOfMultiQubitOutcome_sub
+ *
+ * For a sweep of (small) Qureg sizes it reports the mean wall-clock time per
+ * call (in microseconds) as CSV on stdout. Build it once against the baseline
+ * (unmodified origin/devel) and once against the optimised branch; the
+ * accompanying analyze.py compares the two CSVs and plots the speedup.
+ *
+ * The GPU is forced on (useGpuAccel=1) and distribution/multithreading are off,
+ * so we isolate the single-GPU code path. syncQuESTEnv() is called before each
+ * timed region so we measure completed GPU work, not just async dispatch.
+ *
+ * Usage:  ./bench_749 [minQubits] [maxQubits] [numTargs] [reps]
+ *   defaults: minQubits=4 maxQubits=20 numTargs=3 reps=2000
+ */
+
+#include "quest.h"
+
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+using clk = std::chrono::high_resolution_clock;
+
+static double timeProjector(Qureg qureg, const std::vector<int>& qubits,
+                            const std::vector<int>& outcomes, int reps) {
+
+    // collapse once so subsequent identical projections are well-defined (prob=1)
+    initPlusState(qureg);
+    applyMultiQubitProjector(qureg, qubits, outcomes);
+    syncQuESTEnv();
+
+    auto t0 = clk::now();
+    for (int r = 0; r < reps; r++)
+        applyMultiQubitProjector(qureg, qubits, outcomes);
+    syncQuESTEnv();
+    auto t1 = clk::now();
+
+    double ns = std::chrono::duration<double, std::nano>(t1 - t0).count();
+    return ns / reps / 1e3; // microseconds per call
+}
+
+static double timeProbOfOutcome(Qureg qureg, const std::vector<int>& qubits,
+                                const std::vector<int>& outcomes, int reps) {
+
+    initPlusState(qureg);
+    volatile qreal sink = 0;
+    sink += calcProbOfMultiQubitOutcome(qureg, qubits, outcomes);
+    syncQuESTEnv();
+
+    auto t0 = clk::now();
+    for (int r = 0; r < reps; r++)
+        sink += calcProbOfMultiQubitOutcome(qureg, qubits, outcomes);
+    syncQuESTEnv();
+    auto t1 = clk::now();
+
+    double ns = std::chrono::duration<double, std::nano>(t1 - t0).count();
+    (void) sink;
+    return ns / reps / 1e3; // microseconds per call
+}
+
+int main(int argc, char** argv) {
+
+    int minQubits = (argc > 1) ? atoi(argv[1]) : 4;
+    int maxQubits = (argc > 2) ? atoi(argv[2]) : 20;
+    int numTargs  = (argc > 3) ? atoi(argv[3]) : 3;
+    int reps      = (argc > 4) ? atoi(argv[4]) : 2000;
+
+    initQuESTEnv();
+
+    // CSV header (printed once; '#' lines are ignored by analyze.py)
+    printf("numQubits,numTargs,proj_us,prob_us\n");
+
+    for (int n = minQubits; n <= maxQubits; n++) {
+
+        // statevector forced onto the GPU only
+        Qureg qureg = createCustomQureg(n, 0, /*distrib*/0, /*gpu*/1, /*mt*/0);
+
+        std::vector<int> qubits(numTargs), outcomes(numTargs, 0);
+        for (int i = 0; i < numTargs; i++)
+            qubits[i] = i;
+
+        double proj = timeProjector(qureg, qubits, outcomes, reps);
+        double prob = timeProbOfOutcome(qureg, qubits, outcomes, reps);
+
+        printf("%d,%d,%.4f,%.4f\n", n, numTargs, proj, prob);
+        fflush(stdout);
+
+        destroyQureg(qureg);
+    }
+
+    finalizeQuESTEnv();
+    return 0;
+}
diff --git a/quest/src/core/accelerator.cpp b/quest/src/core/accelerator.cpp
@@ -1087,13 +1087,17 @@ qcomp accel_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMa
 
 void accel_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) {
 
-    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_multiQubitProjector_sub, qureg, qubits.size() );
-    func(qureg, qubits, outcomes, prob);
+    // the projector uses a bitmask reformulation (see issue #749) which needs no
+    // numTargs-templated dispatch, so we branch directly between the CPU and GPU backends
+    (qureg.isGpuAccelerated)?
+        gpu_statevec_multiQubitProjector_sub(qureg, qubits, outcomes, prob) :
+        cpu_statevec_multiQubitProjector_sub(qureg, qubits, outcomes, prob);
 }
 void accel_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) {
 
-    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, densmatr_multiQubitProjector_sub, qureg, qubits.size() );
-    func(qureg, qubits, outcomes, prob);
+    (qureg.isGpuAccelerated)?
+        gpu_densmatr_multiQubitProjector_sub(qureg, qubits, outcomes, prob) :
+        cpu_densmatr_multiQubitProjector_sub(qureg, qubits, outcomes, prob);
 }
 
 

diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp
@@ -2562,60 +2562,51 @@ template qcomp cpu_densmatr_calcExpecFullStateDiagMatr_sub<false,true >(Qureg, F
  */
 
 
-template <int NumQubits>
 void cpu_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) {
 
     // all qubits are in suffix
-    assert_numTargsMatchesTemplateParam(qubits.size(), NumQubits);
 
     // use cpu_qcomp arithmetic overloads (avoid qcomp's)
     cpu_qcomp* amps  = getCpuQcompPtr(qureg.cpuAmps);
 
     // visit every amp, setting to zero or multiplying it by renorm
     qindex numIts = qureg.numAmpsPerNode;
 
-    // binary value of targeted qubits in basis states which are to be retained
-    qindex retainValue = getIntegerFromBits(outcomes.data(), outcomes.size());
+    // use primitive bitmasks instead of a per-qubit loop (see issue #749); the test
+    // getValueOfBits(n,qubits)==retainValue is exactly (n & qubitMask) == valueMask
+    qindex qubitMask = util_getBitMask(qubits);
+    qindex valueMask = util_getBitMask(qubits, outcomes);
     qreal renorm = 1 / std::sqrt(prob);
 
-    // use template param to compile-time unroll loop in getValueOfBits()
-    SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.size());
-
     #pragma omp parallel for if(qureg.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
 
-        // val = outcomes corresponding to n-th local amp (all qubits are in suffix)
-        qindex val = getValueOfBits(n, qubits.data(), numBits);
-
-        // multiply amp with renorm or zero, if qubit value matches or disagrees
-        amps[n] *= renorm * (val == retainValue);
+        // multiply amp with renorm or zero, depending on whether n's substate matches outcomes
+        amps[n] *= renorm * ((n & qubitMask) == valueMask);
     }
 }
 
 
-template <int NumQubits>
 void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) {
 
     // this function is merely an optimisation to avoid calling the above
     // cpu_statevec_multiQubitProjector_sub() twice upon a density matrix;
     // pre- and post-multiply projector versions DO just call above.
 
     // qubits are unconstrained, and can include prefix qubits
-    assert_numTargsMatchesTemplateParam(qubits.size(), NumQubits);
 
     // use cpu_qcomp arithmetic overloads (avoid qcomp's)
     cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps);
 
     // visit every amp, setting most to zero and multiplying the remainder by renorm
     qindex numIts = qureg.numAmpsPerNode;
 
-    // binary value of targeted qubits in basis states which are to be retained
-    qindex retainValue = getIntegerFromBits(outcomes.data(), outcomes.size());
+    // use primitive bitmasks instead of a per-qubit loop (see issue #749); the original
+    // test (v1==v2)&&(retainValue==v1) is exactly (r & qubitMask)==valueMask && (c & qubitMask)==valueMask
+    qindex qubitMask = util_getBitMask(qubits);
+    qindex valueMask = util_getBitMask(qubits, outcomes);
     qreal renorm = 1 / prob;
 
-    // use template param to compile-time unroll loops in getValueOfBits()
-    SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.size());
-
     #pragma omp parallel for if(qureg.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
 
@@ -2626,19 +2617,12 @@ void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, Const
         qindex r = getBitsRightOfIndex(i, qureg.numQubits);
         qindex c = getBitsLeftOfIndex(i, qureg.numQubits-1);
 
-        qindex v1 = getValueOfBits(r, qubits.data(), numBits);
-        qindex v2 = getValueOfBits(c, qubits.data(), numBits);
-
-        // multiply amp with renorm or zero if values disagree with given outcomes
-        amps[n] *= renorm * (v1 == v2) * (retainValue == v1);
+        // multiply amp with renorm or zero if either row/col substate disagrees with outcomes
+        amps[n] *= renorm * ((r & qubitMask) == valueMask) * ((c & qubitMask) == valueMask);
     }
 }
 
 
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_statevec_multiQubitProjector_sub, (Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) )
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_densmatr_multiQubitProjector_sub, (Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) )
-
-
 
 /*
  * STATE INITIALISATION

diff --git a/quest/src/cpu/cpu_subroutines.hpp b/quest/src/cpu/cpu_subroutines.hpp
@@ -187,8 +187,8 @@ template <bool HasPower, bool UseRealPow> qcomp cpu_densmatr_calcExpecFullStateD
  * PROJECTORS
  */
 
-template <int NumQubits> void cpu_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob);
-template <int NumQubits> void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob);
+void cpu_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob);
+void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob);
 
 
 /*

diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp
@@ -1861,49 +1861,41 @@ template qcomp gpu_densmatr_calcExpecFullStateDiagMatr_sub<false,true >(Qureg, F
  */
 
 
-template <int NumQubits> 
 void gpu_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) {
 
-    // all qubits are in suffix
-    assert_numTargsMatchesTemplateParam(qubits.size(), NumQubits);
+    // all qubits are in suffix; the bitmask reformulation (see issue #749) needs no
+    // per-numTargs template specialisation, so this function is no longer templated
 
 #if QUEST_COMPILE_CUQUANTUM
 
-    // cuQuantum disregards NumQubits template param
     cuquantum_statevec_multiQubitProjector_sub(qureg, qubits, outcomes, prob);
 
 #elif QUEST_COMPILE_CUDA
 
     qreal renorm = 1 / std::sqrt(prob);
-    thrust_statevec_multiQubitProjector_sub<NumQubits>(qureg, qubits, outcomes, renorm);
+    thrust_statevec_multiQubitProjector_sub(qureg, qubits, outcomes, renorm);
 
 #else
     error_gpuSimButGpuNotCompiled();
 #endif
 }
 
 
-template <int NumQubits> 
 void gpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) {
 
     // qubits are unconstrained, and can include prefix qubits
-    assert_numTargsMatchesTemplateParam(qubits.size(), NumQubits);
 
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qreal renorm = 1 / prob;
-    thrust_densmatr_multiQubitProjector_sub<NumQubits>(qureg, qubits, outcomes, renorm);
+    thrust_densmatr_multiQubitProjector_sub(qureg, qubits, outcomes, renorm);
 
 #else
     error_gpuSimButGpuNotCompiled();
 #endif
 }
 
 
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, gpu_statevec_multiQubitProjector_sub, (Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) )
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, gpu_densmatr_multiQubitProjector_sub, (Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) )
-
-
 
 /*
  * STATE INITIALISATION

diff --git a/quest/src/gpu/gpu_subroutines.hpp b/quest/src/gpu/gpu_subroutines.hpp
@@ -183,8 +183,8 @@ template <bool HasPower, bool UseRealPow> qcomp gpu_densmatr_calcExpecFullStateD
  * PROJECTORS
  */
 
-template <int NumQubits> void gpu_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob);
-template <int NumQubits> void gpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob);
+void gpu_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob);
+void gpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob);
 
 
 /*