The current libfabric counter polling implementation requires a series of separate lock acquisitions (in fi_cntr_read(), fi_cntr_readerr(), and fi_cq_read()) that can be avoided if we only use fi_cntr_wait(). We see improvements in put latency benchmarks, with no change in functionality, when the polling code in shmem_transport_put_quiet() is removed.
Original:
void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx)
{
...
while (poll_count < shmem_transport_ofi_put_poll_limit ||
shmem_transport_ofi_put_poll_limit < 0) {
success = fi_cntr_read(ctx->put_cntr);
fail = fi_cntr_readerr(ctx->put_cntr);
cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr);
shmem_transport_probe();
if (success < cnt && fail == 0) {
SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx);
SPINLOCK_BODY();
SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx);
} else if (fail) {
RAISE_ERROR_MSG("Operations completed in error (%" PRIu64 ")\n", fail);
} else {
SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx);
return;
}
poll_count++;
}
cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr);
do {
cnt = cnt_new;
ssize_t ret = fi_cntr_wait(ctx->put_cntr, cnt, -1);
cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr);
OFI_CTX_CHECK_ERROR(ctx, ret);
} while (cnt < cnt_new);
shmem_internal_assert(cnt == cnt_new);
...
Simplified:
void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx)
{
...
uint64_t cnt, cnt_new;
cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr);
do {
cnt = cnt_new;
ssize_t ret = fi_cntr_wait(ctx->put_cntr, cnt, -1);
cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr);
OFI_CTX_CHECK_ERROR(ctx, ret);
} while (cnt < cnt_new);
shmem_internal_assert(cnt == cnt_new);
...
}
The current libfabric counter polling implementation requires a series of separate lock acquisitions (in fi_cntr_read(), fi_cntr_readerr(), and fi_cq_read()) that can be avoided if we only use fi_cntr_wait(). We see improvements in put latency benchmarks, with no change in functionality, when the polling code in shmem_transport_put_quiet() is removed.
Original:
Simplified: