From 69b694e6158f47c2b54ac4e04d36a2317a62a1d2 Mon Sep 17 00:00:00 2001 From: tzh476 Date: Thu, 4 Jun 2026 18:48:18 +0800 Subject: [PATCH 1/7] perf: use BMI2 bit deposit and extract helpers Change-Id: I4bdf73fd74f7b4d8d699124cdca2c2bd7b292e8b --- quest/src/core/bitwise.hpp | 54 +++++++++++++++- tests/unit/CMakeLists.txt | 3 +- tests/unit/bitwise.cpp | 126 +++++++++++++++++++++++++++++++++++++ 3 files changed, 181 insertions(+), 2 deletions(-) create mode 100644 tests/unit/bitwise.cpp diff --git a/quest/src/core/bitwise.hpp b/quest/src/core/bitwise.hpp index f5266afa4..d88f17731 100644 --- a/quest/src/core/bitwise.hpp +++ b/quest/src/core/bitwise.hpp @@ -14,6 +14,11 @@ #include #endif +#if defined(__BMI2__) && (defined(__x86_64__) || defined(_M_X64)) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) + #include + #define QUEST_USE_BMI2_INTRINSICS +#endif + #include "quest/include/types.h" #include "quest/src/core/inliner.hpp" @@ -116,6 +121,35 @@ INLINE qindex setBit(qindex number, int bitIndex, int bitValue) { } +INLINE bool getBitMaskAndCheckIsIncreasing(qindex* maskPtr, const int* bitIndices, int numIndices) { + + // bitIndices can be arbitrarily ordered, though PEXT requires increasing order + qindex mask = 0; + bool isIncreasing = true; + + for (int i=0; i 0) + isIncreasing = isIncreasing && bitIndices[i-1] < bitIndices[i]; + } + + *maskPtr = mask; + return isIncreasing; +} + + +INLINE qindex getBitMaskOfIndices(const int* bitIndices, int numIndices) { + + qindex mask = 0; + + for (int i=0; i(_pdep_u64(static_cast(number), ~static_cast(mask))); + return bitValue? 
result | mask : result; +#endif // bitIndices must be strictly increasing for (int i=0; i(_pext_u64(static_cast(number), static_cast(mask))); +#endif + for (int i=0; i(_pdep_u64(static_cast(number), ~static_cast(getBitMaskOfIndices(bitInds, numBits)))); +#endif + return mask | insertBits(number, bitInds, numBits, 0); } @@ -379,4 +431,4 @@ INLINE void setToBitsOfInteger(int* bits, qindex number, int numBits) { -#endif // BITWISE_HPP \ No newline at end of file +#endif // BITWISE_HPP diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 59341759f..7e689de33 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -2,6 +2,7 @@ target_sources(tests PUBLIC + bitwise.cpp calculations.cpp channels.cpp debug.cpp @@ -16,4 +17,4 @@ target_sources(tests qureg.cpp trotterisation.cpp types.cpp -) \ No newline at end of file +) diff --git a/tests/unit/bitwise.cpp b/tests/unit/bitwise.cpp new file mode 100644 index 000000000..80a0fc5dd --- /dev/null +++ b/tests/unit/bitwise.cpp @@ -0,0 +1,126 @@ +/** @file + * Unit tests of internal bitwise helpers. 
+ * + * @defgroup unitbitwise Bitwise + * @ingroup unittests + */ + +#include "quest/src/core/bitwise.hpp" + +#include + +#include "tests/utils/macros.hpp" + + + +/* + * UTILITIES + */ + +#define TEST_CATEGORY \ + LABEL_UNIT_TAG "[bitwise]" + + +static qindex getReferenceInsertBits(qindex number, const int* bitIndices, int numIndices, int bitValue) { + + for (int i=0; i Date: Fri, 5 Jun 2026 03:58:10 +0800 Subject: [PATCH 2/7] fix: simplify bmi2 intrinsic guards Change-Id: I1fe81d656fe85b0c893e4baf0562c871bc275b81 --- quest/src/core/bitwise.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/quest/src/core/bitwise.hpp b/quest/src/core/bitwise.hpp index d88f17731..e6053572d 100644 --- a/quest/src/core/bitwise.hpp +++ b/quest/src/core/bitwise.hpp @@ -199,7 +199,7 @@ INLINE int getBitMaskParity(qindex mask) { INLINE qindex insertBits(qindex number, const int* bitIndices, int numIndices, int bitValue) { -#if defined(QUEST_USE_BMI2_INTRINSICS) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) +#if defined(QUEST_USE_BMI2_INTRINSICS) qindex mask = getBitMaskOfIndices(bitIndices, numIndices); qindex result = static_cast(_pdep_u64(static_cast(number), ~static_cast(mask))); return bitValue? 
result | mask : result; @@ -230,7 +230,7 @@ INLINE qindex getValueOfBits(qindex number, const int* bitIndices, int numIndice // bits are arbitrarily ordered, which affects value qindex value = 0; -#if defined(QUEST_USE_BMI2_INTRINSICS) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) +#if defined(QUEST_USE_BMI2_INTRINSICS) qindex mask; bool isIncreasing = getBitMaskAndCheckIsIncreasing(&mask, bitIndices, numIndices); @@ -256,7 +256,7 @@ INLINE qindex getValueOfBits(qindex number, const int* bitIndices, int numIndice INLINE qindex insertBitsWithMaskedValues(qindex number, const int* bitInds, int numBits, qindex mask) { // bitInds must be sorted (increasing), and mask must be zero everywhere except bitInds -#if defined(QUEST_USE_BMI2_INTRINSICS) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) +#if defined(QUEST_USE_BMI2_INTRINSICS) return mask | static_cast(_pdep_u64(static_cast(number), ~static_cast(getBitMaskOfIndices(bitInds, numBits)))); #endif From fb20357e4bbfe3d90d34400df62131be1c9b0fe6 Mon Sep 17 00:00:00 2001 From: tzh476 Date: Fri, 5 Jun 2026 04:21:57 +0800 Subject: [PATCH 3/7] test: add bmi2 bitwise benchmark example Change-Id: Iea9e065dafcbd514b16353eb1df9cc0b3b24f879 --- examples/automated/CMakeLists.txt | 6 + examples/automated/benchmark_bmi2_bitwise.cpp | 113 ++++++++++++++++++ 2 files changed, 119 insertions(+) create mode 100644 examples/automated/benchmark_bmi2_bitwise.cpp diff --git a/examples/automated/CMakeLists.txt b/examples/automated/CMakeLists.txt index 5880c2ac0..2fbf257cb 100644 --- a/examples/automated/CMakeLists.txt +++ b/examples/automated/CMakeLists.txt @@ -1,3 +1,9 @@ # @author Tyson Jones add_all_local_examples() + +include(CheckCXXCompilerFlag) +check_cxx_compiler_flag("-mbmi2" QUEST_COMPILER_SUPPORTS_MBMI2) +if (QUEST_COMPILER_SUPPORTS_MBMI2) + target_compile_options(benchmark_bmi2_bitwise_cpp PRIVATE -mbmi2) +endif() diff --git a/examples/automated/benchmark_bmi2_bitwise.cpp 
b/examples/automated/benchmark_bmi2_bitwise.cpp new file mode 
100644 index 000000000..396f7e2fb --- /dev/null +++ b/examples/automated/benchmark_bmi2_bitwise.cpp @@ -0,0 +1,113 @@ +/** @file + * Quick benchmark for BMI2-assisted bit-index helpers. + * + * @author tzh476 + */ + +#include "quest/src/core/bitwise.hpp" + +#include +#include +#include +#include +#include +#include +#include + +static volatile qindex sinkValue = 0; + +template +qindex makeMask(const std::array& indices, qindex pattern) { + qindex mask = 0; + for (size_t i=0; i> i) & 1) + mask |= QINDEX_ONE << indices[i]; + return mask; +} + +template +double benchGet(const std::string& name, const std::array& indices, const std::vector& inputs, qindex ampMask) { + constexpr qindex numIterations = 5000000; + constexpr int numReps = 5; + + size_t inputMask = inputs.size() - 1; + double best = std::numeric_limits::max(); + + for (int r=0; r(0x13579BDF); + auto start = std::chrono::steady_clock::now(); + + for (qindex i=0; i(i) & inputMask] + acc) & ampMask; + acc ^= getValueOfBits(n, indices.data(), static_cast(N)) + (i & 7); + } + + auto end = std::chrono::steady_clock::now(); + sinkValue ^= acc; + + double nsPerCall = std::chrono::duration(end - start).count() / static_cast(numIterations); + best = std::min(best, nsPerCall); + } + + std::cout << std::left << std::setw(30) << name << " " << std::fixed << std::setprecision(3) << best << " ns/call\n"; + return best; +} + +template +double benchInsert(const std::string& name, const std::array& indices, const std::vector& inputs, qindex valueMask, qindex insertedMask) { + constexpr qindex numIterations = 5000000; + constexpr int numReps = 5; + + size_t inputMask = inputs.size() - 1; + double best = std::numeric_limits::max(); + + for (int r=0; r(0x2468ACE0); + auto start = std::chrono::steady_clock::now(); + + for (qindex i=0; i(i) & inputMask] + acc) & valueMask; + acc ^= insertBitsWithMaskedValues(n, indices.data(), static_cast(N), insertedMask) + (i & 15); + } + + auto end = std::chrono::steady_clock::now(); + 
sinkValue ^= acc; + + double nsPerCall = std::chrono::duration(end - start).count() / static_cast(numIterations); + best = std::min(best, nsPerCall); + } + + std::cout << std::left << std::setw(30) << name << " " << std::fixed << std::setprecision(3) << best << " ns/call\n"; + return best; +} + +int main() { +#if defined(QUEST_USE_BMI2_INTRINSICS) + std::cout << "BMI2 intrinsics: enabled\n"; +#else + std::cout << "BMI2 intrinsics: disabled\n"; +#endif + + std::vector inputs(1 << 15); + qindex state = static_cast(0x123456789ABCDEFULL); + for (qindex& input : inputs) { + state = state * static_cast(0x5851F42D4C957F2DULL) + static_cast(0x14057B7EF767814FULL); + input = state; + } + + qindex nineQubitMask = (QINDEX_ONE << 9) - QINDEX_ONE; + const std::array inds2 = {2, 7}; + const std::array inds5 = {0, 2, 4, 6, 8}; + const std::array inds6 = {0, 1, 3, 5, 7, 8}; + + benchGet("getValueOfBits 2 bits", inds2, inputs, nineQubitMask); + benchGet("getValueOfBits 5 bits", inds5, inputs, nineQubitMask); + benchGet("getValueOfBits 6 bits", inds6, inputs, nineQubitMask); + + benchInsert("insertBitsWithMask 2 bits", inds2, inputs, (QINDEX_ONE << 7) - QINDEX_ONE, makeMask(inds2, 0b01)); + benchInsert("insertBitsWithMask 5 bits", inds5, inputs, (QINDEX_ONE << 4) - QINDEX_ONE, makeMask(inds5, 0b10101)); + benchInsert("insertBitsWithMask 6 bits", inds6, inputs, (QINDEX_ONE << 3) - QINDEX_ONE, makeMask(inds6, 0b101011)); + + std::cout << "sink: " << sinkValue << "\n"; + return 0; +} From c030f5297bb44c801d79ca4d235a5b067deb512a Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Thu, 4 Jun 2026 23:35:24 -0400 Subject: [PATCH 4/7] Tailor CI to diff --- .github/workflows/audit.yml | 20 +++++++----- .github/workflows/compile.yml | 56 ++++++++++++++++++--------------- .github/workflows/test_free.yml | 8 +++-- 3 files changed, 48 insertions(+), 36 deletions(-) diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml index 0cec48613..9c3cc41f1 100644 --- 
a/.github/workflows/audit.yml +++ b/.github/workflows/audit.yml @@ -10,15 +10,19 @@ name: audit + +### DEBUG +### disabled this; no mem-dangerous changes on: - push: - branches: - - main - - devel - pull_request: - branches: - - main - - devel + workflow_dispatch: + # push: + # branches: + # - main + # - devel + # pull_request: + # branches: + # - main + # - devel jobs: diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml index c86de84f1..247bdb92c 100644 --- a/.github/workflows/compile.yml +++ b/.github/workflows/compile.yml @@ -23,6 +23,10 @@ name: compile +### DEBUG +### disabled all but single-CPU + + on: push: branches: @@ -61,13 +65,13 @@ jobs: # compile QuEST with all combinations of below flags matrix: os: [windows-latest, ubuntu-latest, macos-latest] - precision: [1, 2, 4] - omp: [ON, OFF] - mpi: [ON, OFF] - cuda: [ON, OFF] - hip: [ON, OFF] - cuquantum: [ON, OFF] - mpilib: ['', 'mpich', 'ompi', 'impi', 'msmpi'] + precision: [2] #[1, 2, 4] + omp: [OFF] #[ON, OFF] + mpi: [OFF] #[ON, OFF] + cuda: [OFF] #[ON, OFF] + hip: [OFF] #[ON, OFF] + cuquantum: [OFF] #[ON, OFF] + mpilib: [''] #['', 'mpich', 'ompi', 'impi', 'msmpi'] # disable deprecated API on MSVC, and assign unique compilers, # so that we can concisely consult e.g. 
matrix.compiler=='cl' @@ -240,7 +244,7 @@ jobs: run: > cmake -B ${{ env.build_dir }} -DQUEST_BUILD_EXAMPLES=ON - -DQUEST_BUILD_TESTS=ON + -DQUEST_BUILD_TESTS=OFF -DQUEST_FLOAT_PRECISION=${{ matrix.precision }} -DQUEST_ENABLE_DEPRECATED_API=${{ matrix.deprecated }} -DQUEST_DISABLE_DEPRECATION_WARNINGS=${{ matrix.deprecated }} @@ -260,24 +264,24 @@ jobs: # run all compiled isolated examples to test for link-time errors, # continuing if any fail (since some deliberately fail) - - name: Run isolated examples (Windows) - if: ${{ matrix.os == 'windows-latest' }} - working-directory: ${{ env.isolated_dir }}/Release/ - shell: pwsh - run: | - Get-ChildItem -Filter '*.exe' -File | - ForEach-Object { - Write-Host "`r`n[[[ $($_.Name) ]]]`r`n" - & $_.FullName - } - - name: Run isolated examples (Unix) - if: ${{ matrix.os != 'windows-latest' }} - working-directory: ${{ env.isolated_dir }} - run: | - for fn in *_c *_cpp; do - printf "\n[[[ $fn ]]]\n" - ./$fn || true - done + # - name: Run isolated examples (Windows) + # if: ${{ matrix.os == 'windows-latest' }} + # working-directory: ${{ env.isolated_dir }}/Release/ + # shell: pwsh + # run: | + # Get-ChildItem -Filter '*.exe' -File | + # ForEach-Object { + # Write-Host "`r`n[[[ $($_.Name) ]]]`r`n" + # & $_.FullName + # } + # - name: Run isolated examples (Unix) + # if: ${{ matrix.os != 'windows-latest' }} + # working-directory: ${{ env.isolated_dir }} + # run: | + # for fn in *_c *_cpp; do + # printf "\n[[[ $fn ]]]\n" + # ./$fn || true + # done # run all compiled 'automated' examples - name: Run automated examples (Windows) diff --git a/.github/workflows/test_free.yml b/.github/workflows/test_free.yml index 2d332e842..2fb81492f 100644 --- a/.github/workflows/test_free.yml +++ b/.github/workflows/test_free.yml @@ -10,6 +10,10 @@ name: test (free, serial) +### DEBUG +### disabled all but single-CPU + + on: push: branches: @@ -41,8 +45,8 @@ jobs: # we will compile QuEST with all precisions but no parallelisation matrix: os: 
[ubuntu-latest, macos-latest, windows-latest] - version: [3, 4] - precision: [1, 2, 4] + version: [4] # [3, 4] + precision: [2] # [1, 2, 4] # MSVC cannot compile deprecated v3 tests exclude: From 06d67ab74714fdf005251f64d9a4c982582d5cb8 Mon Sep 17 00:00:00 2001 From: tzh476 Date: Fri, 5 Jun 2026 11:56:27 +0800 Subject: [PATCH 5/7] ci: tolerate automated example exits on windows Change-Id: Ia987f8b01088101ddbb8f43c180c19f3e07c085c --- .github/workflows/compile.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml index 247bdb92c..b18fd5acb 100644 --- a/.github/workflows/compile.yml +++ b/.github/workflows/compile.yml @@ -293,6 +293,10 @@ jobs: ForEach-Object { Write-Host "`r`n[[[ $($_.Name) ]]]`r`n" & $_.FullName + if ($LASTEXITCODE -ne 0) { + Write-Warning "$($_.Name) exited with code $LASTEXITCODE" + $global:LASTEXITCODE = 0 + } } - name: Run automated examples (Unix) if: ${{ matrix.os != 'windows-latest' }} From 849a544783e735f4543cbb43c82f401b4cf7b0fc Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Fri, 5 Jun 2026 00:54:52 -0400 Subject: [PATCH 6/7] add MacOS x86 to CI tests --- .github/workflows/compile.yml | 2 +- .github/workflows/test_free.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml index b18fd5acb..c66be8dbe 100644 --- a/.github/workflows/compile.yml +++ b/.github/workflows/compile.yml @@ -64,7 +64,7 @@ jobs: # compile QuEST with all combinations of below flags matrix: - os: [windows-latest, ubuntu-latest, macos-latest] + os: [windows-latest, ubuntu-latest, macos-latest, macos-15-intel, macos-26-intel] precision: [2] #[1, 2, 4] omp: [OFF] #[ON, OFF] mpi: [OFF] #[ON, OFF] diff --git a/.github/workflows/test_free.yml b/.github/workflows/test_free.yml index 2fb81492f..0207722df 100644 --- a/.github/workflows/test_free.yml +++ b/.github/workflows/test_free.yml @@ -44,7 +44,7 @@ jobs: # we will compile QuEST 
with all precisions but no parallelisation matrix: - os: [ubuntu-latest, macos-latest, windows-latest] + os: [ubuntu-latest, macos-latest, windows-latest, macos-15-intel, macos-26-intel] version: [4] # [3, 4] precision: [2] # [1, 2, 4] From daa3045431af9c18fe95a4dccc474887b6d47b27 Mon Sep 17 00:00:00 2001 From: tzh476 Date: Fri, 5 Jun 2026 13:05:20 +0800 Subject: [PATCH 7/7] ci: wire intel macos workflow matrix Change-Id: I54386dfde9f0cf209e54e79c80937170e35e622b --- .github/workflows/compile.yml | 8 +++++++- .github/workflows/test_free.yml | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml index c66be8dbe..59b2a4dc0 100644 --- a/.github/workflows/compile.yml +++ b/.github/workflows/compile.yml @@ -43,7 +43,7 @@ jobs: # test only compilation succeeds (no execution) build-test: name: > - ${{ matrix.os == 'ubuntu-latest' && 'Linux' || matrix.os == 'macos-latest' && 'MacOS' || 'Windows' }} + ${{ matrix.os == 'ubuntu-latest' && 'Linux' || startsWith(matrix.os, 'macos') && 'MacOS' || 'Windows' }} [${{ matrix.precision }}] ${{ matrix.omp == 'ON' && 'OMP' || '' }} ${{ matrix.mpi == 'ON' && 'MPI' || '' }} @@ -84,6 +84,12 @@ jobs: - os: macos-latest compiler: clang++ deprecated: ON + - os: macos-15-intel + compiler: clang++ + deprecated: ON + - os: macos-26-intel + compiler: clang++ + deprecated: ON - os: windows-latest compiler: cl deprecated: OFF diff --git a/.github/workflows/test_free.yml b/.github/workflows/test_free.yml index 0207722df..f6c20e1dd 100644 --- a/.github/workflows/test_free.yml +++ b/.github/workflows/test_free.yml @@ -31,7 +31,7 @@ jobs: # excluding the v4 integration tests, for free serial-unit-test: name: > - ${{ matrix.os == 'ubuntu-latest' && 'Linux' || matrix.os == 'macos-latest' && 'MacOS' || 'Windows' }} + ${{ matrix.os == 'ubuntu-latest' && 'Linux' || startsWith(matrix.os, 'macos') && 'MacOS' || 'Windows' }} [${{ matrix.precision }}] serial unit v${{ matrix.version }}