diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 000000000..6c1b52726
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,70 @@
+# Benchmark driver for issue #749 — small GPU allocations
+
+> **Note for maintainers:** this directory is a throw-away benchmarking aid shared
+> at the reviewer's request (PR #783). It is **not** wired into CMake/CI and can be
+> deleted before merge — the squash will keep it out of the permanent history.
+
+`benchmark_749.cpp` times the two multi-qubit GPU operations targeted by #749, which
+previously copied a qubit-index list to the device (`cudaMalloc` + `cudaMemcpyAsync`
++ `cudaFree`) on *every* call:
+
+- `applyMultiQubitProjector`     → `*_multiQubitProjector_sub`
+- `calcProbOfMultiQubitOutcome`  → `*_calcProbOfMultiQubitOutcome_sub`
+
+It forces the single-GPU path (`useGpuAccel=1`, distribution/multithreading off),
+calls `syncQuESTEnv()` around each timed region (so we measure *completed* GPU work),
+and prints per-call wall time (µs) as CSV.
+
+## Build & run
+
+It compiles against QuEST through the built-in `USER_SOURCE_NAMES` mechanism — no
+extra include/link flags needed:
+
+```bash
+# from a QuEST checkout (this branch):
+cmake -S . -B build_bench \
+    -D QUEST_ENABLE_CUDA=ON \
+    -D CMAKE_CUDA_ARCHITECTURES=120 \
+    -D CMAKE_BUILD_TYPE=Release \
+    -D USER_SOURCE_NAMES=benchmarks/benchmark_749.cpp \
+    -D USER_OUTPUT_EXE_NAME=bench_749
+cmake --build build_bench --target bench_749 -j
+
+# args: [minQubits=4] [maxQubits=20] [numTargs=3] [reps=2000]
+./build_bench/bench_749 4 20 3 2000
+```
+
+To get the before/after numbers, build the same file once against a clean
+`origin/devel` checkout (baseline) and once against this branch (optimised).
+
+For the CUDA-API counts, wrap a shorter run under Nsight Systems and tally the
+runtime API calls, e.g.:
+
+```bash
+nsys profile --trace=cuda -o trace_749 ./build_bench/bench_749 8 12 3 200
+nsys stats --report cuda_api_sum trace_749.nsys-rep   # cudaMalloc/Free/MemcpyAsync/LaunchKernel
+```
+
+## Results on the author's machine (RTX PRO 6000 Blackwell, CUDA 13.0, sm_120)
+
+CUDA runtime API call counts (N = 8…12, 200 reps, both ops):
+
+| CUDA runtime call | baseline | optimised | change |
+|---|--:|--:|--:|
+| `cudaMalloc`      | 3020 | 1010 | −66% |
+| `cudaFree`        | 3021 | 1011 | −66% |
+| `cudaMemcpyAsync` | 3015 | 1005 | −67% |
+| `cudaLaunchKernel`| 2025 | 2025 | unchanged |
+
+Per-call wall time (µs), `numTargs = 3`, 2000 reps:
+
+| N | projector base | projector opt | speedup | prob base | prob opt | speedup |
+|--:|--:|--:|--:|--:|--:|--:|
+| 4  | 12.41 | 6.48 | 1.92× | 20.35 | 14.59 | 1.39× |
+| 8  | 11.74 | 6.40 | 1.83× | 19.44 | 14.18 | 1.37× |
+| 12 | 11.85 | 6.52 | 1.82× | 19.79 | 14.44 | 1.37× |
+| 16 | 12.22 | 6.84 | 1.79× | 28.63 | 23.21 | 1.23× |
+| 20 | 58.41 | 13.74 | 4.25× | 67.99 | 61.99 | 1.10× |
+
+The residual ~1000 `cudaMalloc` in the optimised build is `thrust::reduce`'s own
+internal temporary inside `calcProb` (inherent to Thrust, out of scope).
diff --git a/benchmarks/benchmark_749.cpp b/benchmarks/benchmark_749.cpp
new file mode 100644
index 000000000..b7c6a3ccc
--- /dev/null
+++ b/benchmarks/benchmark_749.cpp
@@ -0,0 +1,101 @@
+/* benchmark_749.cpp
+ *
+ * Micro-benchmark for QuEST issue #749 ("Profile and optimise-away small GPU
+ * allocations"). It times the two families of multi-qubit GPU operations that
+ * previously copied a qubit-index list to the device on every call:
+ *
+ *   - applyMultiQubitProjector()      -> thrust_*_multiQubitProjector_sub
+ *   - calcProbOfMultiQubitOutcome()   -> thrust_*_calcProbOfMultiQubitOutcome_sub
+ *
+ * For a sweep of (small) Qureg sizes it reports the mean wall-clock time per
+ * call (in microseconds) as CSV on stdout. Build it once against the baseline
+ * (unmodified origin/devel) and once against the optimised branch; the
+ * accompanying analyze.py compares the two CSVs and plots the speedup.
+ *
+ * The GPU is forced on (useGpuAccel=1) and distribution/multithreading are off,
+ * so we isolate the single-GPU code path. syncQuESTEnv() is called before each
+ * timed region so we measure completed GPU work, not just async dispatch.
+ *
+ * Usage:  ./bench_749 [minQubits] [maxQubits] [numTargs] [reps]
+ *   defaults: minQubits=4 maxQubits=20 numTargs=3 reps=2000
+ */
+
+#include "quest.h"
+
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+using clk = std::chrono::high_resolution_clock;
+
+static double timeProjector(Qureg qureg, const std::vector<int>& qubits,
+                            const std::vector<int>& outcomes, int reps) {
+
+    // collapse once so subsequent identical projections are well-defined (prob=1)
+    initPlusState(qureg);
+    applyMultiQubitProjector(qureg, qubits, outcomes);
+    syncQuESTEnv();
+
+    auto t0 = clk::now();
+    for (int r = 0; r < reps; r++)
+        applyMultiQubitProjector(qureg, qubits, outcomes);
+    syncQuESTEnv();
+    auto t1 = clk::now();
+
+    double ns = std::chrono::duration<double, std::nano>(t1 - t0).count();
+    return ns / reps / 1e3; // microseconds per call
+}
+
+static double timeProbOfOutcome(Qureg qureg, const std::vector<int>& qubits,
+                                const std::vector<int>& outcomes, int reps) {
+
+    initPlusState(qureg);
+    volatile qreal sink = 0;
+    sink += calcProbOfMultiQubitOutcome(qureg, qubits, outcomes);
+    syncQuESTEnv();
+
+    auto t0 = clk::now();
+    for (int r = 0; r < reps; r++)
+        sink += calcProbOfMultiQubitOutcome(qureg, qubits, outcomes);
+    syncQuESTEnv();
+    auto t1 = clk::now();
+
+    double ns = std::chrono::duration<double, std::nano>(t1 - t0).count();
+    (void) sink;
+    return ns / reps / 1e3; // microseconds per call
+}
+
+int main(int argc, char** argv) {
+
+    int minQubits = (argc > 1) ? atoi(argv[1]) : 4;
+    int maxQubits = (argc > 2) ? atoi(argv[2]) : 20;
+    int numTargs  = (argc > 3) ? atoi(argv[3]) : 3;
+    int reps      = (argc > 4) ? atoi(argv[4]) : 2000;
+
+    initQuESTEnv();
+
+    // CSV header (printed once; '#' lines are ignored by analyze.py)
+    printf("numQubits,numTargs,proj_us,prob_us\n");
+
+    for (int n = minQubits; n <= maxQubits; n++) {
+
+        // statevector forced onto the GPU only
+        Qureg qureg = createCustomQureg(n, 0, /*distrib*/0, /*gpu*/1, /*mt*/0);
+
+        std::vector<int> qubits(numTargs), outcomes(numTargs, 0);
+        for (int i = 0; i < numTargs; i++)
+            qubits[i] = i;
+
+        double proj = timeProjector(qureg, qubits, outcomes, reps);
+        double prob = timeProbOfOutcome(qureg, qubits, outcomes, reps);
+
+        printf("%d,%d,%.4f,%.4f\n", n, numTargs, proj, prob);
+        fflush(stdout);
+
+        destroyQureg(qureg);
+    }
+
+    finalizeQuESTEnv();
+    return 0;
+}
diff --git a/quest/src/core/accelerator.cpp b/quest/src/core/accelerator.cpp
index 677e6c74a..89c0a920f 100644
--- a/quest/src/core/accelerator.cpp
+++ b/quest/src/core/accelerator.cpp
@@ -1087,13 +1087,17 @@ qcomp accel_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMa
 
 void accel_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) {
 
-    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_multiQubitProjector_sub, qureg, qubits.size() );
-    func(qureg, qubits, outcomes, prob);
+    // the projector uses a bitmask reformulation (see issue #749) which needs no
+    // numTargs-templated dispatch, so we branch directly between the CPU and GPU backends
+    (qureg.isGpuAccelerated)?
+        gpu_statevec_multiQubitProjector_sub(qureg, qubits, outcomes, prob) :
+        cpu_statevec_multiQubitProjector_sub(qureg, qubits, outcomes, prob);
 }
 void accel_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) {
 
-    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, densmatr_multiQubitProjector_sub, qureg, qubits.size() );
-    func(qureg, qubits, outcomes, prob);
+    (qureg.isGpuAccelerated)?
+        gpu_densmatr_multiQubitProjector_sub(qureg, qubits, outcomes, prob) :
+        cpu_densmatr_multiQubitProjector_sub(qureg, qubits, outcomes, prob);
 }
 
 
diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp
index 59df946e9..7cedcd930 100644
--- a/quest/src/cpu/cpu_subroutines.cpp
+++ b/quest/src/cpu/cpu_subroutines.cpp
@@ -2562,11 +2562,9 @@ template qcomp cpu_densmatr_calcExpecFullStateDiagMatr_sub<false,true >(Qureg, F
  */
 
 
-template <int NumQubits>
 void cpu_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) {
 
     // all qubits are in suffix
-    assert_numTargsMatchesTemplateParam(qubits.size(), NumQubits);
 
     // use cpu_qcomp arithmetic overloads (avoid qcomp's)
     cpu_qcomp* amps  = getCpuQcompPtr(qureg.cpuAmps);
@@ -2574,26 +2572,21 @@ void cpu_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, Const
     // visit every amp, setting to zero or multiplying it by renorm
     qindex numIts = qureg.numAmpsPerNode;
 
-    // binary value of targeted qubits in basis states which are to be retained
-    qindex retainValue = getIntegerFromBits(outcomes.data(), outcomes.size());
+    // use primitive bitmasks instead of a per-qubit loop (see issue #749); the test
+    // getValueOfBits(n,qubits)==retainValue is exactly (n & qubitMask) == valueMask
+    qindex qubitMask = util_getBitMask(qubits);
+    qindex valueMask = util_getBitMask(qubits, outcomes);
     qreal renorm = 1 / std::sqrt(prob);
 
-    // use template param to compile-time unroll loop in getValueOfBits()
-    SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.size());
-
     #pragma omp parallel for if(qureg.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
 
-        // val = outcomes corresponding to n-th local amp (all qubits are in suffix)
-        qindex val = getValueOfBits(n, qubits.data(), numBits);
-
-        // multiply amp with renorm or zero, if qubit value matches or disagrees
-        amps[n] *= renorm * (val == retainValue);
+        // multiply amp with renorm or zero, depending on whether n's substate matches outcomes
+        amps[n] *= renorm * ((n & qubitMask) == valueMask);
     }
 }
 
 
-template <int NumQubits>
 void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) {
 
     // this function is merely an optimisation to avoid calling the above
@@ -2601,7 +2594,6 @@ void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, Const
     // pre- and post-multiply projector versions DO just call above.
 
     // qubits are unconstrained, and can include prefix qubits
-    assert_numTargsMatchesTemplateParam(qubits.size(), NumQubits);
 
     // use cpu_qcomp arithmetic overloads (avoid qcomp's)
     cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps);
@@ -2609,13 +2601,12 @@ void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, Const
     // visit every amp, setting most to zero and multiplying the remainder by renorm
     qindex numIts = qureg.numAmpsPerNode;
 
-    // binary value of targeted qubits in basis states which are to be retained
-    qindex retainValue = getIntegerFromBits(outcomes.data(), outcomes.size());
+    // use primitive bitmasks instead of a per-qubit loop (see issue #749); the original
+    // test (v1==v2)&&(retainValue==v1) is exactly (r & qubitMask)==valueMask && (c & qubitMask)==valueMask
+    qindex qubitMask = util_getBitMask(qubits);
+    qindex valueMask = util_getBitMask(qubits, outcomes);
     qreal renorm = 1 / prob;
 
-    // use template param to compile-time unroll loops in getValueOfBits()
-    SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.size());
-
     #pragma omp parallel for if(qureg.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
 
@@ -2626,19 +2617,12 @@ void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, Const
         qindex r = getBitsRightOfIndex(i, qureg.numQubits);
         qindex c = getBitsLeftOfIndex(i, qureg.numQubits-1);
 
-        qindex v1 = getValueOfBits(r, qubits.data(), numBits);
-        qindex v2 = getValueOfBits(c, qubits.data(), numBits);
-
-        // multiply amp with renorm or zero if values disagree with given outcomes
-        amps[n] *= renorm * (v1 == v2) * (retainValue == v1);
+        // multiply amp with renorm or zero if either row/col substate disagrees with outcomes
+        amps[n] *= renorm * ((r & qubitMask) == valueMask) * ((c & qubitMask) == valueMask);
     }
 }
 
 
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_statevec_multiQubitProjector_sub, (Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) )
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_densmatr_multiQubitProjector_sub, (Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) )
-
-
 
 /*
  * STATE INITIALISATION
diff --git a/quest/src/cpu/cpu_subroutines.hpp b/quest/src/cpu/cpu_subroutines.hpp
index 3dbae057b..f8773fc1c 100644
--- a/quest/src/cpu/cpu_subroutines.hpp
+++ b/quest/src/cpu/cpu_subroutines.hpp
@@ -187,8 +187,8 @@ template <bool HasPower, bool UseRealPow> qcomp cpu_densmatr_calcExpecFullStateD
  * PROJECTORS
  */
 
-template <int NumQubits> void cpu_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob);
-template <int NumQubits> void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob);
+void cpu_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob);
+void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob);
 
 
 /*
diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp
index 9b8e819b5..296fa6704 100644
--- a/quest/src/gpu/gpu_subroutines.cpp
+++ b/quest/src/gpu/gpu_subroutines.cpp
@@ -1861,21 +1861,19 @@ template qcomp gpu_densmatr_calcExpecFullStateDiagMatr_sub<false,true >(Qureg, F
  */
 
 
-template <int NumQubits> 
 void gpu_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) {
 
-    // all qubits are in suffix
-    assert_numTargsMatchesTemplateParam(qubits.size(), NumQubits);
+    // all qubits are in suffix; the bitmask reformulation (see issue #749) needs no
+    // per-numTargs template specialisation, so this function is no longer templated
 
 #if QUEST_COMPILE_CUQUANTUM
 
-    // cuQuantum disregards NumQubits template param
     cuquantum_statevec_multiQubitProjector_sub(qureg, qubits, outcomes, prob);
 
 #elif QUEST_COMPILE_CUDA
 
     qreal renorm = 1 / std::sqrt(prob);
-    thrust_statevec_multiQubitProjector_sub<NumQubits>(qureg, qubits, outcomes, renorm);
+    thrust_statevec_multiQubitProjector_sub(qureg, qubits, outcomes, renorm);
 
 #else
     error_gpuSimButGpuNotCompiled();
@@ -1883,16 +1881,14 @@ void gpu_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, Const
 }
 
 
-template <int NumQubits> 
 void gpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) {
 
     // qubits are unconstrained, and can include prefix qubits
-    assert_numTargsMatchesTemplateParam(qubits.size(), NumQubits);
 
 #if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qreal renorm = 1 / prob;
-    thrust_densmatr_multiQubitProjector_sub<NumQubits>(qureg, qubits, outcomes, renorm);
+    thrust_densmatr_multiQubitProjector_sub(qureg, qubits, outcomes, renorm);
 
 #else
     error_gpuSimButGpuNotCompiled();
@@ -1900,10 +1896,6 @@ void gpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, Const
 }
 
 
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, gpu_statevec_multiQubitProjector_sub, (Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) )
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, gpu_densmatr_multiQubitProjector_sub, (Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) )
-
-
 
 /*
  * STATE INITIALISATION
diff --git a/quest/src/gpu/gpu_subroutines.hpp b/quest/src/gpu/gpu_subroutines.hpp
index 029e0e871..9c6a3013f 100644
--- a/quest/src/gpu/gpu_subroutines.hpp
+++ b/quest/src/gpu/gpu_subroutines.hpp
@@ -183,8 +183,8 @@ template <bool HasPower, bool UseRealPow> qcomp gpu_densmatr_calcExpecFullStateD
  * PROJECTORS
  */
 
-template <int NumQubits> void gpu_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob);
-template <int NumQubits> void gpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob);
+void gpu_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob);
+void gpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob);
 
 
 /*
diff --git a/quest/src/gpu/gpu_thrust.cuh b/quest/src/gpu/gpu_thrust.cuh
index 864cca5f8..2a948750f 100644
--- a/quest/src/gpu/gpu_thrust.cuh
+++ b/quest/src/gpu/gpu_thrust.cuh
@@ -448,12 +448,18 @@ struct functor_insertBits {
     // is used to enumerate specific basis-state indices
     // with qubits in the specified bit values
 
-    int* sortedIndsPtr;
+    // we store the sorted qubit indices by-value inside a trivially-copyable
+    // List64 (a fixed-size CUDA-compatible array), rather than as a pointer to
+    // a separately-allocated thrust::device_vector. This eliminates the per-call
+    // cudaMalloc + cudaMemcpy of the qubit list (see issue #749); the list is
+    // instead passed directly as a kernel argument and loaded from constant/param
+    // memory, which is negligibly cheap even for small Quregs.
+    List64 sortedInds;
     qindex valueMask;
     int numBits;
 
-    functor_insertBits(int* ptr, qindex mask, int nBits) :
-        sortedIndsPtr(ptr), valueMask(mask), numBits(nBits)
+    functor_insertBits(List64 sorted, qindex mask, int nBits) :
+        sortedInds(sorted), valueMask(mask), numBits(nBits)
     {
         assert_numTargsMatchesTemplateParam(nBits, NumBits);
     }
@@ -464,7 +470,7 @@ struct functor_insertBits {
         SET_VAR_AT_COMPILE_TIME(int, nbits, NumBits, numBits);
 
         // return ith local index where bits have the specified values at the specified indices
-        return insertBitsWithMaskedValues(i, sortedIndsPtr, nbits, valueMask);
+        return insertBitsWithMaskedValues(i, sortedInds.data(), nbits, valueMask);
     }
 };
 
@@ -508,72 +514,65 @@ struct functor_getFidelityTerm {
 };
 
 
-template <int NumTargets>
 struct functor_projectStateVec {
 
-    // this functor multiplies an amp with zero or a 
+    // this functor multiplies an amp with zero or a
     // renormalisation codfficient, depending on whether
     // the basis state of the amp has qubits in a particular
     // configuration. This is used to project statevector
     // qubits into a particular measurement outcome
 
-    int* targetsPtr;
-    int numTargets, rank;
-    qindex retainValue;
+    // rather than copying the target-qubit list to the device (incurring a
+    // cudaMalloc + cudaMemcpy per call; see issue #749), we pass two primitive
+    // bitmasks which are loaded directly into device registers. The test
+    // getValueOfBits(n,targs)==retainValue is exactly equivalent to
+    // (n & qubitMask) == valueMask, where qubitMask flags the target bit
+    // positions and valueMask holds the desired outcomes at those positions.
+    qindex qubitMask, valueMask;
     qreal renorm;
 
     functor_projectStateVec(
-        int* targetsPtr, int numTargets, 
-        qindex retainValue, qreal renorm
+        qindex qubitMask, qindex valueMask, qreal renorm
     ) :
-        targetsPtr(targetsPtr), numTargets(numTargets),
-        retainValue(retainValue), renorm(renorm)
-    { 
-        assert_numTargsMatchesTemplateParam(numTargets, NumTargets);
-    }
+        qubitMask(qubitMask), valueMask(valueMask), renorm(renorm)
+    { }
 
     __host__ __device__ gpu_qcomp operator()(qindex n, gpu_qcomp amp) {
 
-        // use the compile-time value if possible, to auto-unroll the getValueOfBits() loop below
-        SET_VAR_AT_COMPILE_TIME(int, numBits, NumTargets, numTargets);
-
         // return amp scaled by zero or renorm, depending on whether n has projected substate
-        qindex val = getValueOfBits(n, targetsPtr, numBits);
-        qreal fac = renorm * (val == retainValue);
+        qreal fac = renorm * ((n & qubitMask) == valueMask);
         return fac * amp;
     }
 };
 
 
-template <int NumTargets>
 struct functor_projectDensMatr {
 
-    // this functor multiplies an amp with zero or a 
+    // this functor multiplies an amp with zero or a
     // renormalisation coefficient, depending on whether
     // the basis state of the amp has qubits in a particular
     // configuration. This is used to project density matrix
     // qubits into a particular measurement outcome
 
-    int* targetsPtr;
-    int numTargets, rank, numQuregQubits;
-    qindex logNumAmpsPerNode, retainValue;
+    // as with functor_projectStateVec, we avoid copying the target-qubit list to
+    // the device (see issue #749) by passing primitive bitmasks. The original test
+    // (v1==v2) && (retainValue==v1) keeps an amp iff both the row and column
+    // substates equal the requested outcome, which is exactly
+    // (r & qubitMask)==valueMask && (c & qubitMask)==valueMask.
+    int rank, numQuregQubits;
+    qindex logNumAmpsPerNode, qubitMask, valueMask;
     qreal renorm;
 
     functor_projectDensMatr(
-        int* targetsPtr, int numTargets, int rank, int numQuregQubits,
-        qindex logNumAmpsPerNode, qindex retainValue, qreal renorm
+        qindex qubitMask, qindex valueMask, int rank, int numQuregQubits,
+        qindex logNumAmpsPerNode, qreal renorm
     ) :
-        targetsPtr(targetsPtr), numTargets(numTargets), rank(rank), numQuregQubits(numQuregQubits),
-        logNumAmpsPerNode(logNumAmpsPerNode), retainValue(retainValue), renorm(renorm)
-    { 
-        assert_numTargsMatchesTemplateParam(numTargets, NumTargets);
-    }
+        rank(rank), numQuregQubits(numQuregQubits), logNumAmpsPerNode(logNumAmpsPerNode),
+        qubitMask(qubitMask), valueMask(valueMask), renorm(renorm)
+    { }
 
     __host__ __device__ gpu_qcomp operator()(qindex n, gpu_qcomp amp) {
 
-        // use the compile-time value if possible, to auto-unroll the getValueOfBits() loop below
-        SET_VAR_AT_COMPILE_TIME(int, numBits, NumTargets, numTargets);
-
         // i = global index of nth local amp
         qindex i = concatenateBits(rank, n, logNumAmpsPerNode);
 
@@ -581,11 +580,8 @@ struct functor_projectDensMatr {
         qindex r = getBitsRightOfIndex(i, numQuregQubits);
         qindex c = getBitsLeftOfIndex(i, numQuregQubits-1);
 
-        qindex v1 = getValueOfBits(r, targetsPtr, numBits);
-        qindex v2 = getValueOfBits(c, targetsPtr, numBits);
-
-        // multiply amp with renorm or zero if values disagree with given outcomes
-        qreal fac = renorm * (v1 == v2) * (retainValue == v1);
+        // multiply amp with renorm or zero if either row/col substate disagrees with outcomes
+        qreal fac = renorm * ((r & qubitMask) == valueMask) * ((c & qubitMask) == valueMask);
         return fac * amp;
     }
 };
@@ -792,10 +788,11 @@ qreal thrust_densmatr_calcTotalProb_sub(Qureg qureg) {
 template <int NumQubits>
 qreal thrust_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes) {
 
-    devints sortedQubits = getDevInts(util_getSorted(qubits));
+    // pass the sorted qubit list by-value into the functor (no device alloc; see issue #749)
+    List64 sortedQubits = util_getSorted(qubits);
     qindex valueMask = util_getBitMask(qubits, outcomes);
 
-    auto indFunctor = functor_insertBits<NumQubits>(getPtr(sortedQubits), valueMask, qubits.size());
+    auto indFunctor = functor_insertBits<NumQubits>(sortedQubits, valueMask, qubits.size());
     auto probFunctor = functor_getAmpNorm();
 
     auto rawIter = thrust::make_counting_iterator(QINDEX_ZERO);
@@ -812,12 +809,11 @@ qreal thrust_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, ConstList64 q
 template <int NumQubits>
 qreal thrust_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes) {
 
-    // cannot move these into functor_insertBits constructor, since the memory
-    // would dangle - and we cannot bind deviceints as an attribute - it's host-only!
-    devints sortedQubits = getDevInts(util_getSorted(qubits));
+    // pass the sorted qubit list by-value into the functor (no device alloc; see issue #749)
+    List64 sortedQubits = util_getSorted(qubits);
     qindex valueMask = util_getBitMask(qubits, outcomes);
 
-    auto basisIndFunctor = functor_insertBits<NumQubits>(getPtr(sortedQubits), valueMask, qubits.size());
+    auto basisIndFunctor = functor_insertBits<NumQubits>(sortedQubits, valueMask, qubits.size());
     auto diagIndFunctor = functor_getDiagInd(qureg);
     auto probFunctor = functor_getAmpReal();
 
@@ -1015,13 +1011,13 @@ gpu_qcomp thrust_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateD
  */
 
 
-template <int NumQubits>
 void thrust_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal renorm) {
 
-    devints devQubits = getDevInts(qubits);
-    qindex retainValue = getIntegerFromBits(outcomes.data(), outcomes.size());
-    auto projFunctor = functor_projectStateVec<NumQubits>(
-        getPtr(devQubits), qubits.size(), retainValue, renorm);
+    // pass primitive bitmasks instead of copying the qubit list to device (see issue #749)
+    qindex qubitMask = util_getBitMask(qubits);
+    qindex valueMask = util_getBitMask(qubits, outcomes);
+    auto projFunctor = functor_projectStateVec(
+        qubitMask, valueMask, renorm);
 
     auto indIter = thrust::make_counting_iterator(QINDEX_ZERO);
     auto ampIter = getStartPtr(qureg);
@@ -1031,14 +1027,14 @@ void thrust_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, Co
 }
 
 
-template <int NumQubits>
 void thrust_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal renorm) {
 
-    devints devQubits = getDevInts(qubits);
-    qindex retainValue = getIntegerFromBits(outcomes.data(), outcomes.size());
-    auto projFunctor = functor_projectDensMatr<NumQubits>(
-        getPtr(devQubits), qubits.size(), qureg.rank, qureg.numQubits,
-        qureg.logNumAmpsPerNode, retainValue, renorm);
+    // pass primitive bitmasks instead of copying the qubit list to device (see issue #749)
+    qindex qubitMask = util_getBitMask(qubits);
+    qindex valueMask = util_getBitMask(qubits, outcomes);
+    auto projFunctor = functor_projectDensMatr(
+        qubitMask, valueMask, qureg.rank, qureg.numQubits,
+        qureg.logNumAmpsPerNode, renorm);
 
     auto indIter = thrust::make_counting_iterator(QINDEX_ZERO);
     auto ampIter = getStartPtr(qureg);