diff --git a/quest/src/cpu/CMakeLists.txt b/quest/src/cpu/CMakeLists.txt index 76429407d..5bc2eea79 100644 --- a/quest/src/cpu/CMakeLists.txt +++ b/quest/src/cpu/CMakeLists.txt @@ -4,4 +4,14 @@ target_sources(QuEST PRIVATE cpu_config.cpp cpu_subroutines.cpp -) \ No newline at end of file +) + +# issue #598: optionally use compensated (Kahan) summation in applyCompMatr's +# dense-matrix CPU subroutine. Off by default (naive summation); enable with +# -DQUEST_COMPENSATE_DENSEMATR_SUM=ON +# to trade ~2-7x runtime for ~25x better accuracy on many-target matrices. +option(QUEST_COMPENSATE_DENSEMATR_SUM "Use compensated (Kahan) summation in applyCompMatr's CPU subroutine (issue #598)" OFF) +if (QUEST_COMPENSATE_DENSEMATR_SUM) + target_compile_definitions(QuEST PRIVATE QUEST_COMPENSATED_DENSEMATR_SUM=1) + message(STATUS "Compensated (Kahan) dense-matrix summation is turned ON.") +endif() \ No newline at end of file diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp index 59df946e9..6672cae57 100644 --- a/quest/src/cpu/cpu_subroutines.cpp +++ b/quest/src/cpu/cpu_subroutines.cpp @@ -607,8 +607,14 @@ void cpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, Co // i = nth local index where ctrls are active and targs form value k qindex i = setBits(i0, targs.data(), numTargBits, k); // loop may be unrolled - amps[i] = getCpuQcomp(0, 0); - + + // accumulate the row-vector inner-product (matr[k] . cache) in a + // thread-private scalar, writing to memory once at the end + cpu_qcomp sum = getCpuQcomp(0, 0); + #if defined(QUEST_COMPENSATED_DENSEMATR_SUM) + cpu_qcomp comp = getCpuQcomp(0, 0); // running Kahan compensation + #endif + // loop may be unrolled for (qindex j=0; j