Skip to content

Commit c826f6a

Browse files
committed
ITS: make parallelisation for final track fit more consistent
with the rest of the code and let TBB decide on the grain size. Signed-off-by: Felix Schlepper <felix.schlepper@cern.ch>
1 parent 35db565 commit c826f6a

2 files changed

Lines changed: 39 additions & 33 deletions

File tree

Detectors/ITSMFT/ITS/tracking/include/ITStracking/Constants.h

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -27,18 +27,21 @@ constexpr float MB = KB * KB;
2727
constexpr float GB = MB * KB;
2828
constexpr bool DoTimeBenchmarks = true;
2929
constexpr bool SaveTimeBenchmarks = false;
30-
constexpr float Tolerance = 1e-12; // numerical tolerance
31-
constexpr int ClustersPerCell = 3; // number of clusters for a cell
32-
constexpr float MaxTrackSeedQ2Pt = 1.e3f; // maximum q/pt for track seeds
33-
constexpr int UnusedIndex = -1; // global unused flag
34-
constexpr float UnsetValue = -999.f; // global unset value
35-
constexpr float Radl = 9.36f; // Radiation length of Si [cm]
36-
constexpr float Rho = 2.33f; // Density of Si [g/cm^3]
37-
constexpr int MaxIter = 4; // Max. supported iterations
38-
constexpr int MaxSelectedTrackletsPerCluster = 100; // vertexer: max lines per cluster
39-
constexpr int GPUBlocks = 60; // default CUDA/HIP launch blocks
40-
constexpr int GPUThreads = 256; // default CUDA/HIP launch threads
41-
constexpr int GPUThreadsTotal = GPUBlocks * GPUThreads;
30+
constexpr float Tolerance = 1e-12; // numerical tolerance
31+
constexpr int ClustersPerCell = 3; // number of clusters for a cell
32+
constexpr int UnusedIndex = -1; // global unused flag
33+
constexpr float UnsetValue = -999.f; // global unset value
34+
constexpr float Radl = 9.36f; // Radiation length of Si [cm]
35+
constexpr float Rho = 2.33f; // Density of Si [g/cm^3]
36+
constexpr int MaxIter = 4; // Max. supported iterations
37+
constexpr int MaxSelectedTrackletsPerCluster = 100; // vertexer: max lines per cluster
38+
constexpr int NumberOfConcurrentSeeds = 16; // default split per worker for the final track fit/extrapolation step
39+
constexpr int MinNumberOfConcurrentSeeds = (1 << 8); // minimum chunk size for a worker for the final track fit/extrapolation step
40+
constexpr int MaxNumberOfConcurrentSeeds = (1 << 12); // maximum chunk size for a worker for the final track fit/extrapolation step
41+
constexpr float MaxTrackSeedQ2Pt = 1.e3f; // maximum q/pt for track seeds
42+
constexpr int GPUBlocks = 60; // default CUDA/HIP launch blocks
43+
constexpr int GPUThreads = 256; // default CUDA/HIP launch threads
44+
constexpr int GPUThreadsTotal = GPUBlocks * GPUThreads; // default CUDA/HIP total launched threads
4245

4346
namespace helpers
4447
{

Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx

Lines changed: 24 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515

1616
#include <algorithm>
1717
#include <array>
18-
#include <atomic>
1918
#include <iterator>
2019
#include <mutex>
2120
#include <ranges>
@@ -25,6 +24,7 @@
2524

2625
#include <oneapi/tbb/blocked_range.h>
2726
#include <oneapi/tbb/enumerable_thread_specific.h>
27+
#include <oneapi/tbb/parallel_for.h>
2828

2929
#include "DetectorsBase/Propagator.h"
3030
#include "GPUCommonMath.h"
@@ -812,31 +812,34 @@ void TrackerTraits<NLayers>::findRoads(const int iteration)
812812
bounded_vector<TrackITSExt> tracks(mMemoryPool.get());
813813
mTaskArena->execute([&] {
814814
const int nSeeds = static_cast<int>(trackSeeds.size());
815-
const int nWorkers = std::min(static_cast<int>(mTaskArena->max_concurrency()), nSeeds);
816-
const int chunkSize = std::min(nSeeds, std::clamp(nSeeds / (16 * nWorkers), 256, 4096));
817-
std::atomic<int> nextSeed{0};
815+
const int maxConcurrency = std::max(1, mTaskArena->max_concurrency());
816+
const int chunkSize = std::min(nSeeds, std::clamp(nSeeds / (constants::NumberOfConcurrentSeeds * maxConcurrency), constants::MinNumberOfConcurrentSeeds, constants::MaxNumberOfConcurrentSeeds)); // acts as memory bound and minimum work
817+
818+
// flush local track vector to global vector on reaching chunkSize
818819
std::mutex tracksMutex;
819-
tbb::parallel_for(0, nWorkers, [&](const int) {
820+
auto flushTracks = [&](bounded_vector<TrackITSExt>& localTracks) {
821+
if (localTracks.empty()) {
822+
return;
823+
}
824+
std::lock_guard lock{tracksMutex};
825+
tracks.insert(tracks.end(), std::make_move_iterator(localTracks.begin()), std::make_move_iterator(localTracks.end()));
826+
localTracks.clear();
827+
};
828+
829+
// each worker works on its own range
830+
tbb::parallel_for(tbb::blocked_range<int>(0, nSeeds, chunkSize), [&](const auto& range) {
820831
bounded_vector<TrackITSExt> localTracks(mMemoryPool.get());
821-
localTracks.reserve(chunkSize);
822-
while (true) {
823-
const int firstSeed = nextSeed.fetch_add(chunkSize, std::memory_order_relaxed);
824-
if (firstSeed >= nSeeds) {
825-
break;
826-
}
827-
const int lastSeed = std::min(firstSeed + chunkSize, nSeeds);
828-
for (int iSeed{firstSeed}; iSeed < lastSeed; ++iSeed) {
829-
TrackITSExt temporaryTrack;
830-
if (finaliseTrackSeed(trackSeeds[iSeed], temporaryTrack, iteration, tfInfos, unsortedClusters, propagator)) {
831-
localTracks.push_back(temporaryTrack);
832-
}
832+
localTracks.reserve(std::min(chunkSize, static_cast<int>(range.size())));
833+
for (int iSeed{range.begin()}; iSeed < range.end(); ++iSeed) {
834+
TrackITSExt temporaryTrack;
835+
if (finaliseTrackSeed(trackSeeds[iSeed], temporaryTrack, iteration, tfInfos, unsortedClusters, propagator)) {
836+
localTracks.push_back(temporaryTrack);
833837
}
834-
if (!localTracks.empty()) {
835-
std::lock_guard lock{tracksMutex};
836-
tracks.insert(tracks.end(), std::make_move_iterator(localTracks.begin()), std::make_move_iterator(localTracks.end()));
837-
localTracks.clear();
838+
if (static_cast<int>(localTracks.size()) == chunkSize) {
839+
flushTracks(localTracks);
838840
}
839841
}
842+
flushTracks(localTracks); // flush remaining
840843
deepVectorClear(localTracks);
841844
});
842845

0 commit comments

Comments
 (0)