|
15 | 15 |
|
16 | 16 | #include <algorithm> |
17 | 17 | #include <array> |
18 | | -#include <atomic> |
19 | 18 | #include <iterator> |
20 | 19 | #include <mutex> |
21 | 20 | #include <ranges> |
|
25 | 24 |
|
26 | 25 | #include <oneapi/tbb/blocked_range.h> |
27 | 26 | #include <oneapi/tbb/enumerable_thread_specific.h> |
| 27 | +#include <oneapi/tbb/parallel_for.h> |
28 | 28 |
|
29 | 29 | #include "DetectorsBase/Propagator.h" |
30 | 30 | #include "GPUCommonMath.h" |
@@ -812,31 +812,34 @@ void TrackerTraits<NLayers>::findRoads(const int iteration) |
812 | 812 | bounded_vector<TrackITSExt> tracks(mMemoryPool.get()); |
813 | 813 | mTaskArena->execute([&] { |
814 | 814 | const int nSeeds = static_cast<int>(trackSeeds.size()); |
815 | | - const int nWorkers = std::min(static_cast<int>(mTaskArena->max_concurrency()), nSeeds); |
816 | | - const int chunkSize = std::min(nSeeds, std::clamp(nSeeds / (16 * nWorkers), 256, 4096)); |
817 | | - std::atomic<int> nextSeed{0}; |
| 815 | + const int maxConcurrency = std::max(1, mTaskArena->max_concurrency()); |
| 816 | + const int chunkSize = std::min(nSeeds, std::clamp(nSeeds / (constants::NumberOfConcurrentSeeds * maxConcurrency), constants::MinNumberOfConcurrentSeeds, constants::MaxNumberOfConcurrentSeeds)); // acts as memory bound and minimum work |
| 817 | + |
| 818 | + // move a task's local tracks into the shared vector under the mutex; called whenever the local buffer reaches chunkSize and once more at the end of each range |
818 | 819 | std::mutex tracksMutex; |
819 | | - tbb::parallel_for(0, nWorkers, [&](const int) { |
| 820 | + auto flushTracks = [&](bounded_vector<TrackITSExt>& localTracks) { |
| 821 | + if (localTracks.empty()) { |
| 822 | + return; |
| 823 | + } |
| 824 | + std::lock_guard lock{tracksMutex}; |
| 825 | + tracks.insert(tracks.end(), std::make_move_iterator(localTracks.begin()), std::make_move_iterator(localTracks.end())); |
| 826 | + localTracks.clear(); |
| 827 | + }; |
| 828 | + |
| 829 | + // TBB splits [0, nSeeds) into sub-ranges (grain size chunkSize); each body invocation fills its own local track vector |
| 830 | + tbb::parallel_for(tbb::blocked_range<int>(0, nSeeds, chunkSize), [&](const auto& range) { |
820 | 831 | bounded_vector<TrackITSExt> localTracks(mMemoryPool.get()); |
821 | | - localTracks.reserve(chunkSize); |
822 | | - while (true) { |
823 | | - const int firstSeed = nextSeed.fetch_add(chunkSize, std::memory_order_relaxed); |
824 | | - if (firstSeed >= nSeeds) { |
825 | | - break; |
826 | | - } |
827 | | - const int lastSeed = std::min(firstSeed + chunkSize, nSeeds); |
828 | | - for (int iSeed{firstSeed}; iSeed < lastSeed; ++iSeed) { |
829 | | - TrackITSExt temporaryTrack; |
830 | | - if (finaliseTrackSeed(trackSeeds[iSeed], temporaryTrack, iteration, tfInfos, unsortedClusters, propagator)) { |
831 | | - localTracks.push_back(temporaryTrack); |
832 | | - } |
| 832 | + localTracks.reserve(std::min(chunkSize, static_cast<int>(range.size()))); |
| 833 | + for (int iSeed{range.begin()}; iSeed < range.end(); ++iSeed) { |
| 834 | + TrackITSExt temporaryTrack; |
| 835 | + if (finaliseTrackSeed(trackSeeds[iSeed], temporaryTrack, iteration, tfInfos, unsortedClusters, propagator)) { |
| 836 | + localTracks.push_back(temporaryTrack); |
833 | 837 | } |
834 | | - if (!localTracks.empty()) { |
835 | | - std::lock_guard lock{tracksMutex}; |
836 | | - tracks.insert(tracks.end(), std::make_move_iterator(localTracks.begin()), std::make_move_iterator(localTracks.end())); |
837 | | - localTracks.clear(); |
| 838 | + if (static_cast<int>(localTracks.size()) == chunkSize) { |
| 839 | + flushTracks(localTracks); |
838 | 840 | } |
839 | 841 | } |
| 842 | + flushTracks(localTracks); // flush remaining |
840 | 843 | deepVectorClear(localTracks); |
841 | 844 | }); |
842 | 845 |
|
|
0 commit comments