From bbd0f5c7a1d3e17f3c510f40191f6d93cc73e3a9 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Mon, 20 Sep 2021 10:36:22 -0500 Subject: [PATCH 01/21] test/xfail: remove multi-thread huge send xfails --- test/mpi/maint/jenkins/xfail.conf | 1 - 1 file changed, 1 deletion(-) diff --git a/test/mpi/maint/jenkins/xfail.conf b/test/mpi/maint/jenkins/xfail.conf index dc2c31b6148..09ad90d5a8c 100644 --- a/test/mpi/maint/jenkins/xfail.conf +++ b/test/mpi/maint/jenkins/xfail.conf @@ -35,7 +35,6 @@ * * * ch4:ofi * /^idup_nb/ xfail=ticket3794 test/mpi/threads/comm/testlist * * * ch4:ucx * /^idup_comm_gen/ xfail=ticket3794 test/mpi/threads/comm/testlist * * * ch4:ucx * /^idup_nb/ xfail=ticket3794 test/mpi/threads/comm/testlist -* * * ch4:ofi * /^mt_.*_huge.* env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384/ xfail=ticket5359 test/mpi/threads/pt2pt/testlist ################################################################################ # misc special build * * nofast * * /^large_acc_flush_local/ xfail=issue4663 test/mpi/rma/testlist From 30f1c6e579e829fab186bc0c219665676c8bca22 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 22 Sep 2021 17:23:02 -0500 Subject: [PATCH 02/21] test: add more tests for sending huge message --- test/mpi/errors/pt2pt/testlist | 1 + test/mpi/errors/pt2pt/truncmsg1.c | 8 ++++++++ test/mpi/pt2pt/testlist.in | 4 ++++ 3 files changed, 13 insertions(+) diff --git a/test/mpi/errors/pt2pt/testlist b/test/mpi/errors/pt2pt/testlist index 4e1e629b5a1..e21ed1fd68b 100644 --- a/test/mpi/errors/pt2pt/testlist +++ b/test/mpi/errors/pt2pt/testlist @@ -1,5 +1,6 @@ proberank 1 truncmsg1 2 +truncmsg1 2 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 truncmsg2 2 truncmsg_mrecv 2 mpiversion=3.0 # multiple completion ests diff --git a/test/mpi/errors/pt2pt/truncmsg1.c b/test/mpi/errors/pt2pt/truncmsg1.c index 7ebe7402808..2328a27bb83 100644 --- a/test/mpi/errors/pt2pt/truncmsg1.c +++ b/test/mpi/errors/pt2pt/truncmsg1.c @@ -83,6 +83,14 @@ int main(int argc, char *argv[]) err = MPI_Recv(buf, LongLen - 1, MPI_INT, source, 0, MPI_COMM_WORLD, &status); errs += checkTruncError(err, "long"); } + /* Test when the receive buffer is much shorter */ + if (rank == source) { + err = MPI_Send(buf, LongLen, MPI_INT, dest, 0, MPI_COMM_WORLD); + errs += checkOk(err, "long"); + } else if (rank == dest) { + err = MPI_Recv(buf, ShortLen, MPI_INT, source, 0, MPI_COMM_WORLD, &status); + errs += checkTruncError(err, "long-receive-short"); + } } free(buf); diff --git a/test/mpi/pt2pt/testlist.in b/test/mpi/pt2pt/testlist.in index 66656e5c02f..9f713342e95 100644 --- a/test/mpi/pt2pt/testlist.in +++ b/test/mpi/pt2pt/testlist.in @@ -52,10 +52,14 @@ waitany_null 1 # perhaps disable in the release tarball large_message 3 mem=6.5 mprobe 2 +mprobe 2 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 +mprobe 2 env=MPIR_CVAR_CH4_OFI_AM_LONG_FORCE_PIPELINE=true big_count_status 1 many_isend 3 manylmt 2 huge_underflow 2 +huge_underflow 2 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 +huge_underflow 2 env=MPIR_CVAR_CH4_OFI_AM_LONG_FORCE_PIPELINE=true huge_anysrc 2 huge_dupcomm 2 huge_ssend 2 From 9d5c2c958e2fa8b0ec4045906814f34c6bf0227c Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Mon, 27 Sep 2021 00:17:21 -0500 Subject: [PATCH 03/21] test: enhance test/mpi/pt2pt/probe_unexp.c Enhance the test by test multiple communicators. This catches the current ofi huge path's incorrect context_id handling. --- test/mpi/pt2pt/probe_unexp.c | 190 ++++++++++++++++++----------------- 1 file changed, 99 insertions(+), 91 deletions(-) diff --git a/test/mpi/pt2pt/probe_unexp.c b/test/mpi/pt2pt/probe_unexp.c index 0dda3336b61..3e867f15d4f 100644 --- a/test/mpi/pt2pt/probe_unexp.c +++ b/test/mpi/pt2pt/probe_unexp.c @@ -17,8 +17,10 @@ char buf[1 << MAX_BUF_SIZE_LG]; * been called. This program may hang if MPI_Probe() does not return when the * message finally arrives (see req #375). */ + int main(int argc, char **argv) { + MPI_Comm comm; int p_size; int p_rank; int msg_size_lg; @@ -27,115 +29,121 @@ int main(int argc, char **argv) MTest_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &p_size); - MPI_Comm_rank(MPI_COMM_WORLD, &p_rank); - /* To improve reporting of problems about operations, we - * change the error handler to errors return */ - MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN); - - - for (msg_size_lg = 0; msg_size_lg <= MAX_BUF_SIZE_LG; msg_size_lg++) { - const int msg_size = 1 << msg_size_lg; - int msg_cnt; + while (MTestGetIntracommGeneral(&comm, 2, 1)) { + if (comm == MPI_COMM_NULL) { + continue; + } - MTestPrintfMsg(2, "testing messages of size %d\n", msg_size); - for (msg_cnt = 0; msg_cnt < NUM_MSGS_PER_BUF_SIZE; msg_cnt++) { - MPI_Status status; - const int tag = msg_size_lg * NUM_MSGS_PER_BUF_SIZE + msg_cnt; + MPI_Comm_size(comm, &p_size); + MPI_Comm_rank(comm, &p_rank); + /* To improve reporting of problems about operations, we + * change the error handler to errors return */ + MPI_Comm_set_errhandler(comm, MPI_ERRORS_RETURN); + + + for (msg_size_lg = 0; msg_size_lg <= MAX_BUF_SIZE_LG; msg_size_lg++) { + const int msg_size = 1 << msg_size_lg; + int msg_cnt; + + MTestPrintfMsg(2, "testing messages of size %d\n", msg_size); + for (msg_cnt = 0; msg_cnt < NUM_MSGS_PER_BUF_SIZE; msg_cnt++) { + MPI_Status status; + const int tag = msg_size_lg * NUM_MSGS_PER_BUF_SIZE + msg_cnt; + + MTestPrintfMsg(2, "Message count %d\n", msg_cnt); + if (p_rank == 0) { + int p; + + for (p = 1; p < p_size; p++) { + /* Wait for synchronization message */ + mpi_errno = MPI_Recv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE, tag, comm, &status); + if (mpi_errno != MPI_SUCCESS && errs++ < 10) { + MTestPrintError(mpi_errno); + } + + if (status.MPI_TAG != tag && errs++ < 10) { + printf + ("ERROR: unexpected message tag from MPI_Recv(): lp=0, rp=%d, expected=%d, actual=%d, count=%d\n", + status.MPI_SOURCE, status.MPI_TAG, tag, msg_cnt); + } +# if defined(VERBOSE) + { + printf("sending message: p=%d s=%d c=%d\n", + status.MPI_SOURCE, msg_size, msg_cnt); + } +# endif - MTestPrintfMsg(2, "Message count %d\n", msg_cnt); - if (p_rank == 0) { - int p; + /* Send unexpected message which hopefully MPI_Probe() is + * already waiting for at the remote process */ + mpi_errno = MPI_Send(buf, msg_size, MPI_BYTE, + status.MPI_SOURCE, status.MPI_TAG, comm); + if (mpi_errno != MPI_SUCCESS && errs++ < 10) { + MTestPrintError(mpi_errno); + } + } + } else { + int incoming_msg_size; - for (p = 1; p < p_size; p++) { - /* Wait for synchronization message */ - mpi_errno = MPI_Recv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE, - tag, MPI_COMM_WORLD, &status); + /* Send synchronization message */ + mpi_errno = MPI_Send(NULL, 0, MPI_BYTE, 0, tag, comm); if (mpi_errno != MPI_SUCCESS && errs++ < 10) { MTestPrintError(mpi_errno); } + /* Perform probe, hopefully before the main process can + * send its reply */ + mpi_errno = MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, comm, &status); + if (mpi_errno != MPI_SUCCESS && errs++ < 10) { + MTestPrintError(mpi_errno); + } + mpi_errno = MPI_Get_count(&status, MPI_BYTE, &incoming_msg_size); + if (mpi_errno != MPI_SUCCESS && errs++ < 10) { + MTestPrintError(mpi_errno); + } + if (status.MPI_SOURCE != 0 && errs++ < 10) { + printf + ("ERROR: unexpected message source from MPI_Probe(): p=%d, expected=0, actual=%d, count=%d\n", + p_rank, status.MPI_SOURCE, msg_cnt); + } if (status.MPI_TAG != tag && errs++ < 10) { printf - ("ERROR: unexpected message tag from MPI_Recv(): lp=0, rp=%d, expected=%d, actual=%d, count=%d\n", - status.MPI_SOURCE, status.MPI_TAG, tag, msg_cnt); + ("ERROR: unexpected message tag from MPI_Probe(): p=%d, expected=%d, actual=%d, count=%d\n", + p_rank, tag, status.MPI_TAG, msg_cnt); } -# if defined(VERBOSE) - { - printf("sending message: p=%d s=%d c=%d\n", - status.MPI_SOURCE, msg_size, msg_cnt); + if (incoming_msg_size != msg_size && errs++ < 10) { + printf + ("ERROR: unexpected message size from MPI_Probe(): p=%d, expected=%d, actual=%d, count=%d\n", + p_rank, msg_size, incoming_msg_size, msg_cnt); } -# endif - /* Send unexpected message which hopefully MPI_Probe() is - * already waiting for at the remote process */ - mpi_errno = MPI_Send(buf, msg_size, MPI_BYTE, - status.MPI_SOURCE, status.MPI_TAG, MPI_COMM_WORLD); + /* Receive the probed message from the main process */ + mpi_errno = MPI_Recv(buf, msg_size, MPI_BYTE, 0, tag, comm, &status); if (mpi_errno != MPI_SUCCESS && errs++ < 10) { MTestPrintError(mpi_errno); } - } - } else { - int incoming_msg_size; - - /* Send synchronization message */ - mpi_errno = MPI_Send(NULL, 0, MPI_BYTE, 0, tag, MPI_COMM_WORLD); - if (mpi_errno != MPI_SUCCESS && errs++ < 10) { - MTestPrintError(mpi_errno); - } - - /* Perform probe, hopefully before the main process can - * send its reply */ - mpi_errno = MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); - if (mpi_errno != MPI_SUCCESS && errs++ < 10) { - MTestPrintError(mpi_errno); - } - mpi_errno = MPI_Get_count(&status, MPI_BYTE, &incoming_msg_size); - if (mpi_errno != MPI_SUCCESS && errs++ < 10) { - MTestPrintError(mpi_errno); - } - if (status.MPI_SOURCE != 0 && errs++ < 10) { - printf - ("ERROR: unexpected message source from MPI_Probe(): p=%d, expected=0, actual=%d, count=%d\n", - p_rank, status.MPI_SOURCE, msg_cnt); - } - if (status.MPI_TAG != tag && errs++ < 10) { - printf - ("ERROR: unexpected message tag from MPI_Probe(): p=%d, expected=%d, actual=%d, count=%d\n", - p_rank, tag, status.MPI_TAG, msg_cnt); - } - if (incoming_msg_size != msg_size && errs++ < 10) { - printf - ("ERROR: unexpected message size from MPI_Probe(): p=%d, expected=%d, actual=%d, count=%d\n", - p_rank, msg_size, incoming_msg_size, msg_cnt); - } - - /* Receive the probed message from the main process */ - mpi_errno = MPI_Recv(buf, msg_size, MPI_BYTE, 0, tag, MPI_COMM_WORLD, &status); - if (mpi_errno != MPI_SUCCESS && errs++ < 10) { - MTestPrintError(mpi_errno); - } - mpi_errno = MPI_Get_count(&status, MPI_BYTE, &incoming_msg_size); - if (mpi_errno != MPI_SUCCESS && errs++ < 10) { - MTestPrintError(mpi_errno); - } - if (status.MPI_SOURCE != 0 && errs++ < 10) { - printf - ("ERROR: unexpected message source from MPI_Recv(): p=%d, expected=0, actual=%d, count=%d\n", - p_rank, status.MPI_SOURCE, msg_cnt); - } - if (status.MPI_TAG != tag && errs++ < 10) { - printf - ("ERROR: unexpected message tag from MPI_Recv(): p=%d, expected=%d, actual=%d, count=%d\n", - p_rank, tag, status.MPI_TAG, msg_cnt); - } - if (incoming_msg_size != msg_size && errs++ < 10) { - printf - ("ERROR: unexpected message size from MPI_Recv(): p=%d, expected=%d, actual=%d, count=%d\n", - p_rank, msg_size, incoming_msg_size, msg_cnt); + mpi_errno = MPI_Get_count(&status, MPI_BYTE, &incoming_msg_size); + if (mpi_errno != MPI_SUCCESS && errs++ < 10) { + MTestPrintError(mpi_errno); + } + if (status.MPI_SOURCE != 0 && errs++ < 10) { + printf + ("ERROR: unexpected message source from MPI_Recv(): p=%d, expected=0, actual=%d, count=%d\n", + p_rank, status.MPI_SOURCE, msg_cnt); + } + if (status.MPI_TAG != tag && errs++ < 10) { + printf + ("ERROR: unexpected message tag from MPI_Recv(): p=%d, expected=%d, actual=%d, count=%d\n", + p_rank, tag, status.MPI_TAG, msg_cnt); + } + if (incoming_msg_size != msg_size && errs++ < 10) { + printf + ("ERROR: unexpected message size from MPI_Recv(): p=%d, expected=%d, actual=%d, count=%d\n", + p_rank, msg_size, incoming_msg_size, msg_cnt); + } } } } + MTestFreeComm(&comm); } MTest_Finalize(errs); From f53eff194372f5e8fa210d6df2d6096d80e6afc5 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Fri, 24 Sep 2021 10:29:56 -0500 Subject: [PATCH 04/21] ch4/ofi: remove done_fn in MPIDI_OFI_huge_recv_t Since we always call MPIDI_OFI_recv_event at completion of huge message, we do not need the indirect function pointer. Remove for cleaner code. --- src/mpid/ch4/netmod/ofi/ofi_events.c | 6 ++---- src/mpid/ch4/netmod/ofi/ofi_types.h | 1 - 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_events.c b/src/mpid/ch4/netmod/ofi/ofi_events.c index 00520fafa8f..34e4bc1db5a 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_events.c +++ b/src/mpid/ch4/netmod/ofi/ofi_events.c @@ -78,7 +78,6 @@ static int peek_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * rre recv_elem->remote_info.tag = huge_list_ptr->tag = MPIDI_OFI_TAG_MASK & wc->tag; recv_elem->localreq = huge_list_ptr->rreq = rreq; recv_elem->event_id = MPIDI_OFI_EVENT_GET_HUGE; - recv_elem->done_fn = MPIDI_OFI_recv_event; recv_elem->wc = *wc; if (MPIDI_OFI_COMM(comm_ptr).enable_striping) { recv_elem->cur_offset = MPIDI_OFI_STRIPE_CHUNK_SIZE; @@ -220,7 +219,6 @@ static int recv_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request recv_elem->peek = false; recv_elem->comm_ptr = comm_ptr; recv_elem->localreq = rreq; - recv_elem->done_fn = MPIDI_OFI_recv_event; recv_elem->wc = *wc; if (MPIDI_OFI_COMM(comm_ptr).enable_striping) { recv_elem->cur_offset = MPIDI_OFI_STRIPE_CHUNK_SIZE; @@ -358,11 +356,11 @@ int MPIDI_OFI_get_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reques } if (bytesToGet == 0ULL && recv_elem->chunks_outstanding == 0) { MPIDI_OFI_send_control_t ctrl; - /* recv_elem->localreq may be freed during done_fn. + /* recv_elem->localreq may be freed during MPIDI_OFI_recv_event. * Need to backup the handle here for later use with MPIDIU_map_erase. */ uint64_t key_to_erase = recv_elem->localreq->handle; recv_elem->wc.len = recv_elem->cur_offset; - recv_elem->done_fn(vni, &recv_elem->wc, recv_elem->localreq, recv_elem->event_id); + MPIDI_OFI_recv_event(vni, &recv_elem->wc, recv_elem->localreq, recv_elem->event_id); ctrl.type = MPIDI_OFI_CTRL_HUGEACK; mpi_errno = MPIDI_OFI_do_control_send(&ctrl, NULL, 0, recv_elem->remote_info.origin_rank, diff --git a/src/mpid/ch4/netmod/ofi/ofi_types.h b/src/mpid/ch4/netmod/ofi/ofi_types.h index d37206c08bc..42a0c97f7be 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_types.h +++ b/src/mpid/ch4/netmod/ofi/ofi_types.h @@ -493,7 +493,6 @@ typedef struct MPIDI_OFI_huge_recv { char pad[MPIDI_REQUEST_HDR_SIZE]; struct fi_context context[MPIDI_OFI_CONTEXT_STRUCTS]; /* fixed field, do not move */ int event_id; /* fixed field, do not move */ - int (*done_fn) (int vni, struct fi_cq_tagged_entry * wc, MPIR_Request * req, int event_id); MPIDI_OFI_send_control_t remote_info; bool peek; /* Flag to indicate whether this struct has been created to track an uncompleted peek * operation. */ From ac2d82b3e779de21862d7a2649de8f83fb4ddfa7 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Fri, 24 Sep 2021 11:32:18 -0500 Subject: [PATCH 05/21] ch4/ofi: refactor sending ctrl messages Make MPIDI_OFI_send_control_t internally a union to reflect the general control semantics. Replace MPIDI_OFI_do_control_send with MPIDI_NM_am_send_hdr. --- src/mpid/ch4/netmod/ofi/ofi_control.h | 35 -------------------------- src/mpid/ch4/netmod/ofi/ofi_events.c | 12 ++++++--- src/mpid/ch4/netmod/ofi/ofi_events.h | 1 - src/mpid/ch4/netmod/ofi/ofi_send.h | 36 ++++++++++++++++----------- src/mpid/ch4/netmod/ofi/ofi_types.h | 16 +++++++++--- src/mpid/ch4/netmod/ofi/util.c | 15 ++++------- 6 files changed, 47 insertions(+), 68 deletions(-) delete mode 100644 src/mpid/ch4/netmod/ofi/ofi_control.h diff --git a/src/mpid/ch4/netmod/ofi/ofi_control.h b/src/mpid/ch4/netmod/ofi/ofi_control.h deleted file mode 100644 index 60d8539dbde..00000000000 --- a/src/mpid/ch4/netmod/ofi/ofi_control.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (C) by Argonne National Laboratory - * See COPYRIGHT in top-level directory - */ - -#ifndef OFI_CONTROL_H_INCLUDED -#define OFI_CONTROL_H_INCLUDED - -#include "ofi_am_impl.h" - -MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_control_send(MPIDI_OFI_send_control_t * control, - char *send_buf, - size_t msgsize, - int rank, MPIR_Comm * comm_ptr, - MPIR_Request * ackreq) -{ - int mpi_errno = MPI_SUCCESS; - MPIR_FUNC_ENTER; - - control->origin_rank = comm_ptr->rank; - control->send_buf = (uintptr_t) send_buf; - control->msgsize = msgsize; - control->comm_id = comm_ptr->context_id; - control->ackreq = ackreq; - - mpi_errno = MPIDI_OFI_do_inject(rank, comm_ptr, - MPIDI_OFI_INTERNAL_HANDLER_CONTROL, - (void *) control, sizeof(*control), 0, 0); - - MPIR_FUNC_EXIT; - return mpi_errno; -} - - -#endif /* OFI_CONTROL_H_INCLUDED */ diff --git a/src/mpid/ch4/netmod/ofi/ofi_events.c b/src/mpid/ch4/netmod/ofi/ofi_events.c index 34e4bc1db5a..5ef501c1b33 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_events.c +++ b/src/mpid/ch4/netmod/ofi/ofi_events.c @@ -309,7 +309,7 @@ static uintptr_t recv_rbase(MPIDI_OFI_huge_recv_t * recv_elem) if (!MPIDI_OFI_ENABLE_MR_VIRT_ADDRESS) { return 0; } else { - return recv_elem->remote_info.send_buf; + return (uintptr_t) recv_elem->remote_info.send_buf; } } @@ -362,9 +362,13 @@ int MPIDI_OFI_get_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reques recv_elem->wc.len = recv_elem->cur_offset; MPIDI_OFI_recv_event(vni, &recv_elem->wc, recv_elem->localreq, recv_elem->event_id); ctrl.type = MPIDI_OFI_CTRL_HUGEACK; - mpi_errno = - MPIDI_OFI_do_control_send(&ctrl, NULL, 0, recv_elem->remote_info.origin_rank, - recv_elem->comm_ptr, recv_elem->remote_info.ackreq); + ctrl.u.huge_ack.ackreq = recv_elem->remote_info.ackreq; + /* note: it's receiver ack sender */ + int vni_remote = recv_elem->remote_info.vni_src; + int vni_local = recv_elem->remote_info.vni_dst; + mpi_errno = MPIDI_NM_am_send_hdr(recv_elem->remote_info.origin_rank, recv_elem->comm_ptr, + MPIDI_OFI_INTERNAL_HANDLER_CONTROL, + &ctrl, sizeof(ctrl), vni_local, vni_remote); MPIR_ERR_CHECK(mpi_errno); MPIDIU_map_erase(MPIDI_OFI_global.huge_recv_counters, key_to_erase); diff --git a/src/mpid/ch4/netmod/ofi/ofi_events.h b/src/mpid/ch4/netmod/ofi/ofi_events.h index 79f31916591..ec6dc1f27c5 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_events.h +++ b/src/mpid/ch4/netmod/ofi/ofi_events.h @@ -9,7 +9,6 @@ #include "ofi_impl.h" #include "ofi_am_impl.h" #include "ofi_am_events.h" -#include "ofi_control.h" #include "utlist.h" int MPIDI_OFI_rma_done_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * in_req); diff --git a/src/mpid/ch4/netmod/ofi/ofi_send.h b/src/mpid/ch4/netmod/ofi/ofi_send.h index 124cd8cbe3c..ea2412cef7a 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_send.h +++ b/src/mpid/ch4/netmod/ofi/ofi_send.h @@ -278,8 +278,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_normal(const void *buf, MPI_Aint cou tsenddata, FALSE /* eagain */); MPIR_T_PVAR_COUNTER_INC(MULTINIC, nic_sent_bytes_count[sender_nic], data_sz); } else if (unlikely(1)) { - MPIDI_OFI_send_control_t ctrl; - int i, num_nics = MPIDI_OFI_global.num_nics; + int num_nics = MPIDI_OFI_global.num_nics; uint64_t rma_keys[MPIDI_OFI_MAX_NICS]; struct fid_mr **huge_send_mrs; uint64_t msg_size = MPIDI_OFI_STRIPE_CHUNK_SIZE; @@ -295,18 +294,17 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_normal(const void *buf, MPI_Aint cou (struct fid_mr **) MPL_malloc((num_nics * sizeof(struct fid_mr *)), MPL_MEM_BUFFER); if (!MPIDI_OFI_ENABLE_MR_PROV_KEY) { /* Set up a memory region for the lmt data transfer */ - for (i = 0; i < num_nics; i++) { - ctrl.rma_keys[i] = + for (int i = 0; i < num_nics; i++) { + rma_keys[i] = MPIDI_OFI_mr_key_alloc(MPIDI_OFI_LOCAL_MR_KEY, MPIDI_OFI_INVALID_MR_KEY); - rma_keys[i] = ctrl.rma_keys[i]; } } else { /* zero them to avoid warnings */ - for (i = 0; i < num_nics; i++) { + for (int i = 0; i < num_nics; i++) { rma_keys[i] = 0; } } - for (i = 0; i < num_nics; i++) { + for (int i = 0; i < num_nics; i++) { MPIDI_OFI_CALL(fi_mr_reg(MPIDI_OFI_global.ctx[MPIDI_OFI_get_ctx_index(comm, vni_local, i)].domain, /* In: Domain Object */ send_buf, /* In: Lower memory address */ data_sz, /* In: Length */ @@ -322,8 +320,8 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_normal(const void *buf, MPI_Aint cou MPL_MEM_BUFFER); if (MPIDI_OFI_ENABLE_MR_PROV_KEY) { /* MR_BASIC */ - for (i = 0; i < num_nics; i++) { - ctrl.rma_keys[i] = fi_mr_key(huge_send_mrs[i]); + for (int i = 0; i < num_nics; i++) { + rma_keys[i] = fi_mr_key(huge_send_mrs[i]); } } @@ -346,14 +344,24 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_normal(const void *buf, MPI_Aint cou vni_local, tsenddata, FALSE /* eagain */); MPIR_T_PVAR_COUNTER_INC(MULTINIC, nic_sent_bytes_count[sender_nic], msg_size); MPIR_T_PVAR_COUNTER_INC(MULTINIC, striped_nic_sent_bytes_count[sender_nic], msg_size); + + MPIDI_OFI_send_control_t ctrl; ctrl.type = MPIDI_OFI_CTRL_HUGE; - ctrl.seqno = 0; - ctrl.tag = tag; - ctrl.vni_src = vni_src; - ctrl.vni_dst = vni_dst; + for (int i = 0; i < num_nics; i++) { + ctrl.u.huge.rma_keys[i] = rma_keys[i]; + } + ctrl.u.huge.tag = tag; + ctrl.u.huge.vni_src = vni_src; + ctrl.u.huge.vni_dst = vni_dst; + ctrl.u.huge.origin_rank = comm->rank; + ctrl.u.huge.send_buf = send_buf; + ctrl.u.huge.msgsize = data_sz; + ctrl.u.huge.comm_id = comm->context_id; + ctrl.u.huge.ackreq = sreq; /* Send information about the memory region here to get the lmt going. */ - mpi_errno = MPIDI_OFI_do_control_send(&ctrl, send_buf, data_sz, dst_rank, comm, sreq); + mpi_errno = MPIDI_NM_am_send_hdr(dst_rank, comm, MPIDI_OFI_INTERNAL_HANDLER_CONTROL, + &ctrl, sizeof(ctrl), vni_src, vni_dst); MPIR_ERR_CHECK(mpi_errno); } diff --git a/src/mpid/ch4/netmod/ofi/ofi_types.h b/src/mpid/ch4/netmod/ofi/ofi_types.h index 42a0c97f7be..741bf92be32 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_types.h +++ b/src/mpid/ch4/netmod/ofi/ofi_types.h @@ -386,17 +386,25 @@ typedef struct { } MPIDI_OFI_global_t; typedef struct { - int16_t type; - int16_t seqno; int origin_rank; MPIR_Request *ackreq; - uintptr_t send_buf; + void *send_buf; size_t msgsize; int comm_id; uint64_t rma_keys[MPIDI_OFI_MAX_NICS]; int tag; int vni_src; int vni_dst; +} MPIDI_OFI_huge_remote_info_t; + +typedef struct { + int16_t type; + union { + MPIDI_OFI_huge_remote_info_t huge; + struct { + MPIR_Request *ackreq; + } huge_ack; + } u; } MPIDI_OFI_send_control_t; typedef struct MPIDI_OFI_win_acc_hint { @@ -493,7 +501,7 @@ typedef struct MPIDI_OFI_huge_recv { char pad[MPIDI_REQUEST_HDR_SIZE]; struct fi_context context[MPIDI_OFI_CONTEXT_STRUCTS]; /* fixed field, do not move */ int event_id; /* fixed field, do not move */ - MPIDI_OFI_send_control_t remote_info; + MPIDI_OFI_huge_remote_info_t remote_info; bool peek; /* Flag to indicate whether this struct has been created to track an uncompleted peek * operation. */ size_t cur_offset; diff --git a/src/mpid/ch4/netmod/ofi/util.c b/src/mpid/ch4/netmod/ofi/util.c index 955e88e7860..3521b455060 100644 --- a/src/mpid/ch4/netmod/ofi/util.c +++ b/src/mpid/ch4/netmod/ofi/util.c @@ -152,7 +152,7 @@ void MPIDI_OFI_mr_key_allocator_destroy(void) /* Translate the control message to get a huge message into a request to * actually perform the data transfer. */ -static int MPIDI_OFI_get_huge(int vni, MPIDI_OFI_send_control_t * info) +static int MPIDI_OFI_get_huge(int vni, MPIDI_OFI_huge_remote_info_t * info) { MPIDI_OFI_huge_recv_t *recv_elem = NULL; int mpi_errno = MPI_SUCCESS; @@ -242,17 +242,12 @@ int MPIDI_OFI_control_handler(void *am_hdr, void *data, MPI_Aint data_sz, } switch (ctrlsend->type) { - case MPIDI_OFI_CTRL_HUGEACK:{ - /* FIXME: need vni from the callback parameters */ - mpi_errno = MPIDI_OFI_dispatch_function(0, NULL, ctrlsend->ackreq); - goto fn_exit; - } + case MPIDI_OFI_CTRL_HUGEACK: + mpi_errno = MPIDI_OFI_dispatch_function(0, NULL, ctrlsend->u.huge_ack.ackreq); break; - case MPIDI_OFI_CTRL_HUGE:{ - mpi_errno = MPIDI_OFI_get_huge(0, ctrlsend); - goto fn_exit; - } + case MPIDI_OFI_CTRL_HUGE: + mpi_errno = MPIDI_OFI_get_huge(0, &(ctrlsend->u.huge)); break; default: From 9d483181857fad1f93ef2c0ea9b4e7a3b7bb8f0b Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Fri, 24 Sep 2021 12:12:22 -0500 Subject: [PATCH 06/21] ch4/ofi: refactor huge functions to separate file It is not critical to inline functions related to huge messages as they are bandwidth dominated. Move them into ofi_huge.c for better context. --- src/mpid/ch4/netmod/ofi/Makefile.mk | 1 + src/mpid/ch4/netmod/ofi/ofi_events.c | 301 +-------------------- src/mpid/ch4/netmod/ofi/ofi_events.h | 1 - src/mpid/ch4/netmod/ofi/ofi_huge.c | 388 +++++++++++++++++++++++++++ src/mpid/ch4/netmod/ofi/ofi_impl.h | 4 + src/mpid/ch4/netmod/ofi/util.c | 86 +----- 6 files changed, 405 insertions(+), 376 deletions(-) create mode 100644 src/mpid/ch4/netmod/ofi/ofi_huge.c diff --git a/src/mpid/ch4/netmod/ofi/Makefile.mk b/src/mpid/ch4/netmod/ofi/Makefile.mk index 490aa18e0f4..6b107f60921 100644 --- a/src/mpid/ch4/netmod/ofi/Makefile.mk +++ b/src/mpid/ch4/netmod/ofi/Makefile.mk @@ -16,6 +16,7 @@ mpi_core_sources += src/mpid/ch4/netmod/ofi/func_table.c \ src/mpid/ch4/netmod/ofi/ofi_win.c \ src/mpid/ch4/netmod/ofi/ofi_part.c \ src/mpid/ch4/netmod/ofi/ofi_events.c \ + src/mpid/ch4/netmod/ofi/ofi_huge.c \ src/mpid/ch4/netmod/ofi/ofi_progress.c \ src/mpid/ch4/netmod/ofi/ofi_am_events.c \ src/mpid/ch4/netmod/ofi/ofi_nic.c \ diff --git a/src/mpid/ch4/netmod/ofi/ofi_events.c b/src/mpid/ch4/netmod/ofi/ofi_events.c index 5ef501c1b33..f84d17808dd 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_events.c +++ b/src/mpid/ch4/netmod/ofi/ofi_events.c @@ -14,10 +14,8 @@ static int peek_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq); static int peek_empty_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq); -static int recv_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq); static int send_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * sreq); static int ssend_ack_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * sreq); -static uintptr_t recv_rbase(MPIDI_OFI_huge_recv_t * recv); static int chunk_done_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * req); static int inject_emu_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * req); static int accept_probe_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq); @@ -32,76 +30,25 @@ static int am_read_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * static int peek_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq) { int mpi_errno = MPI_SUCCESS; - size_t count = 0; MPIR_FUNC_ENTER; - rreq->status.MPI_SOURCE = MPIDI_OFI_cqe_get_source(wc, false); - rreq->status.MPI_TAG = MPIDI_OFI_init_get_tag(wc->tag); - rreq->status.MPI_ERROR = MPI_SUCCESS; if (MPIDI_OFI_HUGE_SEND & wc->tag) { - MPIDI_OFI_huge_recv_t *list_ptr; - bool found_msg = false; - - /* If this is a huge message, find the control message on the unexpected list that matches - * with this and return the size in that. */ - LL_FOREACH(MPIDI_unexp_huge_recv_head, list_ptr) { - uint64_t context_id = MPIDI_OFI_CONTEXT_MASK & wc->tag; - uint64_t tag = MPIDI_OFI_TAG_MASK & wc->tag; - if (list_ptr->remote_info.comm_id == context_id && - list_ptr->remote_info.origin_rank == MPIDI_OFI_cqe_get_source(wc, false) && - list_ptr->remote_info.tag == tag) { - count = list_ptr->remote_info.msgsize; - found_msg = true; - } - } - if (!found_msg) { - MPIDI_OFI_huge_recv_t *recv_elem; - MPIDI_OFI_huge_recv_list_t *huge_list_ptr; - - /* Create an element in the posted list that only indicates a peek and will be - * deleted as soon as it's fulfilled without being matched. */ - recv_elem = (MPIDI_OFI_huge_recv_t *) MPL_calloc(sizeof(*recv_elem), 1, MPL_MEM_COMM); - MPIR_ERR_CHKANDJUMP(recv_elem == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem"); - recv_elem->peek = true; - MPIR_Comm *comm_ptr = rreq->comm; - recv_elem->comm_ptr = comm_ptr; - MPIDIU_map_set(MPIDI_OFI_global.huge_recv_counters, rreq->handle, recv_elem, - MPL_MEM_BUFFER); - - huge_list_ptr = - (MPIDI_OFI_huge_recv_list_t *) MPL_calloc(sizeof(*huge_list_ptr), 1, MPL_MEM_COMM); - MPIR_ERR_CHKANDJUMP(huge_list_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem"); - recv_elem->remote_info.comm_id = huge_list_ptr->comm_id = - MPIDI_OFI_CONTEXT_MASK & wc->tag; - recv_elem->remote_info.origin_rank = huge_list_ptr->rank = - MPIDI_OFI_cqe_get_source(wc, false); - recv_elem->remote_info.tag = huge_list_ptr->tag = MPIDI_OFI_TAG_MASK & wc->tag; - recv_elem->localreq = huge_list_ptr->rreq = rreq; - recv_elem->event_id = MPIDI_OFI_EVENT_GET_HUGE; - recv_elem->wc = *wc; - if (MPIDI_OFI_COMM(comm_ptr).enable_striping) { - recv_elem->cur_offset = MPIDI_OFI_STRIPE_CHUNK_SIZE; - } else { - recv_elem->cur_offset = MPIDI_OFI_global.max_msg_size; - } - - LL_APPEND(MPIDI_posted_huge_recv_head, MPIDI_posted_huge_recv_tail, huge_list_ptr); - goto fn_exit; - } - } else { - /* Otherwise just get the size of the message we've already received. */ - count = wc->len; + mpi_errno = MPIDI_OFI_peek_huge_event(vni, wc, rreq); + goto fn_exit; } - MPIR_STATUS_SET_COUNT(rreq->status, count); + + rreq->status.MPI_SOURCE = MPIDI_OFI_cqe_get_source(wc, false); + rreq->status.MPI_TAG = MPIDI_OFI_init_get_tag(wc->tag); + rreq->status.MPI_ERROR = MPI_SUCCESS; + MPIR_STATUS_SET_COUNT(rreq->status, wc->len); /* util_id should be the last thing to change in rreq. Reason is * we use util_id to indicate peek_event has completed and all the * relevant values have been copied to rreq. */ MPL_atomic_release_store_int(&(MPIDI_OFI_REQUEST(rreq, util_id)), MPIDI_OFI_PEEK_FOUND); + fn_exit: MPIR_FUNC_EXIT; return mpi_errno; - fn_fail: - goto fn_exit; } static int peek_empty_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq) @@ -133,109 +80,6 @@ static int peek_empty_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request return MPI_SUCCESS; } -/* If we posted a huge receive, this event gets called to translate the - * completion queue entry into a get huge event */ -static int recv_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq) -{ - int mpi_errno = MPI_SUCCESS; - MPIDI_OFI_huge_recv_t *recv_elem = NULL; - MPIR_Comm *comm_ptr; - MPIR_FUNC_ENTER; - - bool ready_to_get = false; - /* Check that the sender didn't underflow the message by sending less than - * the huge message threshold. When striping is enabled underflow occurs if - * the sender sends < MPIDI_OFI_STRIPE_CHUNK_SIZE through the huge message protocol - * or < MPIDI_OFI_global.stripe_threshold through normal send */ - if (((wc->len < MPIDI_OFI_STRIPE_CHUNK_SIZE || - (wc->len > MPIDI_OFI_STRIPE_CHUNK_SIZE && wc->len < MPIDI_OFI_global.stripe_threshold)) && - MPIDI_OFI_COMM(rreq->comm).enable_striping) || - (wc->len < MPIDI_OFI_global.max_msg_size && !MPIDI_OFI_COMM(rreq->comm).enable_striping)) { - return MPIDI_OFI_recv_event(vni, wc, rreq, MPIDI_OFI_REQUEST(rreq, event_id)); - } - - comm_ptr = rreq->comm; - MPIR_T_PVAR_COUNTER_INC(MULTINIC, nic_recvd_bytes_count[MPIDI_OFI_REQUEST(rreq, nic_num)], - wc->len); - /* Check to see if the tracker is already in the unexpected list. - * Otherwise, allocate one. */ - { - MPIDI_OFI_huge_recv_t *list_ptr; - - MPL_DBG_MSG_FMT(MPIR_DBG_PT2PT, VERBOSE, - (MPL_DBG_FDEST, "SEARCHING HUGE UNEXPECTED LIST: (%d, %d, %llu)", - comm_ptr->context_id, MPIDI_OFI_cqe_get_source(wc, false), - (MPIDI_OFI_TAG_MASK & wc->tag))); - - LL_FOREACH(MPIDI_unexp_huge_recv_head, list_ptr) { - if (list_ptr->remote_info.comm_id == comm_ptr->context_id && - list_ptr->remote_info.origin_rank == MPIDI_OFI_cqe_get_source(wc, false) && - list_ptr->remote_info.tag == (MPIDI_OFI_TAG_MASK & wc->tag)) { - MPL_DBG_MSG_FMT(MPIR_DBG_PT2PT, VERBOSE, - (MPL_DBG_FDEST, "MATCHED HUGE UNEXPECTED LIST: (%d, %d, %llu, %d)", - comm_ptr->context_id, MPIDI_OFI_cqe_get_source(wc, false), - (MPIDI_OFI_TAG_MASK & wc->tag), rreq->handle)); - - LL_DELETE(MPIDI_unexp_huge_recv_head, MPIDI_unexp_huge_recv_tail, list_ptr); - - recv_elem = list_ptr; - MPIDIU_map_set(MPIDI_OFI_global.huge_recv_counters, rreq->handle, recv_elem, - MPL_MEM_COMM); - break; - } - } - } - - if (recv_elem) { - ready_to_get = true; - } else { - MPIDI_OFI_huge_recv_list_t *list_ptr; - - MPL_DBG_MSG_FMT(MPIR_DBG_PT2PT, VERBOSE, - (MPL_DBG_FDEST, "CREATING HUGE POSTED ENTRY: (%d, %d, %llu)", - comm_ptr->context_id, MPIDI_OFI_cqe_get_source(wc, false), - (MPIDI_OFI_TAG_MASK & wc->tag))); - - recv_elem = (MPIDI_OFI_huge_recv_t *) MPL_calloc(sizeof(*recv_elem), 1, MPL_MEM_BUFFER); - MPIR_ERR_CHKANDJUMP(recv_elem == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem"); - MPIDIU_map_set(MPIDI_OFI_global.huge_recv_counters, rreq->handle, recv_elem, - MPL_MEM_BUFFER); - - list_ptr = (MPIDI_OFI_huge_recv_list_t *) MPL_calloc(sizeof(*list_ptr), 1, MPL_MEM_BUFFER); - if (!list_ptr) - MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem"); - - list_ptr->comm_id = comm_ptr->context_id; - list_ptr->rank = MPIDI_OFI_cqe_get_source(wc, false); - list_ptr->tag = (MPIDI_OFI_TAG_MASK & wc->tag); - list_ptr->rreq = rreq; - - LL_APPEND(MPIDI_posted_huge_recv_head, MPIDI_posted_huge_recv_tail, list_ptr); - } - - /* Plug the information for the huge event into the receive request and go - * to the MPIDI_OFI_get_huge_event function. */ - recv_elem->event_id = MPIDI_OFI_EVENT_GET_HUGE; - recv_elem->peek = false; - recv_elem->comm_ptr = comm_ptr; - recv_elem->localreq = rreq; - recv_elem->wc = *wc; - if (MPIDI_OFI_COMM(comm_ptr).enable_striping) { - recv_elem->cur_offset = MPIDI_OFI_STRIPE_CHUNK_SIZE; - } else { - recv_elem->cur_offset = MPIDI_OFI_global.max_msg_size; - } - if (ready_to_get) { - MPIDI_OFI_get_huge_event(vni, NULL, (MPIR_Request *) recv_elem); - } - - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} - static int send_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * sreq) { int mpi_errno = MPI_SUCCESS; @@ -304,133 +148,6 @@ static int ssend_ack_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request return mpi_errno; } -static uintptr_t recv_rbase(MPIDI_OFI_huge_recv_t * recv_elem) -{ - if (!MPIDI_OFI_ENABLE_MR_VIRT_ADDRESS) { - return 0; - } else { - return (uintptr_t) recv_elem->remote_info.send_buf; - } -} - -/* Note: MPIDI_OFI_get_huge_event is invoked from three places -- - * 1. In recv_huge_event, when recv buffer is matched and first chunk received, and - * when control message (with remote info) has also been received. - * 2. In MPIDI_OFI_get_huge, as a callback when control message is received, and - * when first chunk has been matched and received. - * - * recv_huge_event will fill the local request information, and the control message - * callback will fill the remote (sender) information. Lastly -- - * - * 3. As the event function when RDMA read (issued here) completes. - */ -int MPIDI_OFI_get_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * req) -{ - int mpi_errno = MPI_SUCCESS; - MPIDI_OFI_huge_recv_t *recv_elem = (MPIDI_OFI_huge_recv_t *) req; - uint64_t remote_key; - size_t bytesLeft, bytesToGet; - MPIR_FUNC_ENTER; - - void *recv_buf = MPIDI_OFI_REQUEST(recv_elem->localreq, util.iov.iov_base); - - if (MPIDI_OFI_COMM(recv_elem->comm_ptr).enable_striping) { - /* Subtract one stripe_chunk_size because we send the first chunk via a regular message - * instead of the memory region */ - recv_elem->stripe_size = (recv_elem->remote_info.msgsize - MPIDI_OFI_STRIPE_CHUNK_SIZE) - / MPIDI_OFI_global.num_nics; /* striping */ - - if (recv_elem->stripe_size > MPIDI_OFI_global.max_msg_size) { - recv_elem->stripe_size = MPIDI_OFI_global.max_msg_size; - } - if (recv_elem->chunks_outstanding) - recv_elem->chunks_outstanding--; - bytesLeft = recv_elem->remote_info.msgsize - recv_elem->cur_offset; - bytesToGet = (bytesLeft <= recv_elem->stripe_size) ? bytesLeft : recv_elem->stripe_size; - } else { - /* Subtract one max_msg_size because we send the first chunk via a regular message - * instead of the memory region */ - bytesLeft = recv_elem->remote_info.msgsize - recv_elem->cur_offset; - bytesToGet = (bytesLeft <= MPIDI_OFI_global.max_msg_size) ? - bytesLeft : MPIDI_OFI_global.max_msg_size; - } - if (bytesToGet == 0ULL && recv_elem->chunks_outstanding == 0) { - MPIDI_OFI_send_control_t ctrl; - /* recv_elem->localreq may be freed during MPIDI_OFI_recv_event. - * Need to backup the handle here for later use with MPIDIU_map_erase. */ - uint64_t key_to_erase = recv_elem->localreq->handle; - recv_elem->wc.len = recv_elem->cur_offset; - MPIDI_OFI_recv_event(vni, &recv_elem->wc, recv_elem->localreq, recv_elem->event_id); - ctrl.type = MPIDI_OFI_CTRL_HUGEACK; - ctrl.u.huge_ack.ackreq = recv_elem->remote_info.ackreq; - /* note: it's receiver ack sender */ - int vni_remote = recv_elem->remote_info.vni_src; - int vni_local = recv_elem->remote_info.vni_dst; - mpi_errno = MPIDI_NM_am_send_hdr(recv_elem->remote_info.origin_rank, recv_elem->comm_ptr, - MPIDI_OFI_INTERNAL_HANDLER_CONTROL, - &ctrl, sizeof(ctrl), vni_local, vni_remote); - MPIR_ERR_CHECK(mpi_errno); - - MPIDIU_map_erase(MPIDI_OFI_global.huge_recv_counters, key_to_erase); - MPL_free(recv_elem); - - goto fn_exit; - } - - int nic = 0; - int vni_src = recv_elem->remote_info.vni_src; - int vni_dst = recv_elem->remote_info.vni_dst; - if (MPIDI_OFI_COMM(recv_elem->comm_ptr).enable_striping) { /* if striping enabled */ - MPIDI_OFI_cntr_incr(recv_elem->comm_ptr, vni_src, nic); - if (recv_elem->cur_offset >= MPIDI_OFI_STRIPE_CHUNK_SIZE && bytesLeft > 0) { - for (nic = 0; nic < MPIDI_OFI_global.num_nics; nic++) { - int ctx_idx = MPIDI_OFI_get_ctx_index(recv_elem->comm_ptr, vni_dst, nic); - remote_key = recv_elem->remote_info.rma_keys[nic]; - - bytesLeft = recv_elem->remote_info.msgsize - recv_elem->cur_offset; - if (bytesLeft <= 0) { - break; - } - bytesToGet = - (bytesLeft <= recv_elem->stripe_size) ? bytesLeft : recv_elem->stripe_size; - - MPIDI_OFI_CALL_RETRY(fi_read(MPIDI_OFI_global.ctx[ctx_idx].tx, (void *) ((char *) recv_buf + recv_elem->cur_offset), /* local buffer */ - bytesToGet, /* bytes */ - NULL, /* descriptor */ - MPIDI_OFI_comm_to_phys(recv_elem->comm_ptr, recv_elem->remote_info.origin_rank, nic, vni_dst, vni_src), recv_rbase(recv_elem) + recv_elem->cur_offset, /* remote maddr */ - remote_key, /* Key */ - (void *) &recv_elem->context), nic, /* Context */ - rdma_readfrom, FALSE); - MPIR_T_PVAR_COUNTER_INC(MULTINIC, nic_recvd_bytes_count[nic], bytesToGet); - MPIR_T_PVAR_COUNTER_INC(MULTINIC, striped_nic_recvd_bytes_count[nic], bytesToGet); - recv_elem->cur_offset += bytesToGet; - recv_elem->chunks_outstanding++; - } - } - } else { - int ctx_idx = MPIDI_OFI_get_ctx_index(recv_elem->comm_ptr, vni_src, nic); - remote_key = recv_elem->remote_info.rma_keys[nic]; - MPIDI_OFI_cntr_incr(recv_elem->comm_ptr, vni_src, nic); - MPIDI_OFI_CALL_RETRY(fi_read(MPIDI_OFI_global.ctx[ctx_idx].tx, /* endpoint */ - (void *) ((char *) recv_buf + recv_elem->cur_offset), /* local buffer */ - bytesToGet, /* bytes */ - NULL, /* descriptor */ - MPIDI_OFI_comm_to_phys(recv_elem->comm_ptr, recv_elem->remote_info.origin_rank, nic, vni_src, vni_dst), /* Destination */ - recv_rbase(recv_elem) + recv_elem->cur_offset, /* remote maddr */ - remote_key, /* Key */ - (void *) &recv_elem->context), vni_src, rdma_readfrom, /* Context */ - FALSE); - MPIR_T_PVAR_COUNTER_INC(MULTINIC, nic_recvd_bytes_count[nic], bytesToGet); - recv_elem->cur_offset += bytesToGet; - } - - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} - static int chunk_done_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * req) { int c; @@ -771,7 +488,7 @@ int MPIDI_OFI_dispatch_function(int vni, struct fi_cq_tagged_entry *wc, MPIR_Req break; case MPIDI_OFI_EVENT_RECV_HUGE: - mpi_errno = recv_huge_event(vni, wc, req); + mpi_errno = MPIDI_OFI_recv_huge_event(vni, wc, req); break; case MPIDI_OFI_EVENT_RECV_PACK: diff --git a/src/mpid/ch4/netmod/ofi/ofi_events.h b/src/mpid/ch4/netmod/ofi/ofi_events.h index ec6dc1f27c5..b4b00230e37 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_events.h +++ b/src/mpid/ch4/netmod/ofi/ofi_events.h @@ -12,7 +12,6 @@ #include "utlist.h" int MPIDI_OFI_rma_done_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * in_req); -int MPIDI_OFI_get_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * req); int MPIDI_OFI_dispatch_function(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * req); MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_cqe_get_source(struct fi_cq_tagged_entry *wc, bool has_err) diff --git a/src/mpid/ch4/netmod/ofi/ofi_huge.c b/src/mpid/ch4/netmod/ofi/ofi_huge.c new file mode 100644 index 00000000000..a3fa2ea2713 --- /dev/null +++ b/src/mpid/ch4/netmod/ofi/ofi_huge.c @@ -0,0 +1,388 @@ +/* + * Copyright (C) by Argonne National Laboratory + * See COPYRIGHT in top-level directory + */ + +#include +#include "ofi_impl.h" +#include "ofi_events.h" + +/* this function called by recv event of a huge message */ +int MPIDI_OFI_recv_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq) +{ + int mpi_errno = MPI_SUCCESS; + MPIDI_OFI_huge_recv_t *recv_elem = NULL; + MPIR_Comm *comm_ptr; + MPIR_FUNC_ENTER; + + bool ready_to_get = false; + /* Check that the sender didn't underflow the message by sending less than + * the huge message threshold. When striping is enabled underflow occurs if + * the sender sends < MPIDI_OFI_STRIPE_CHUNK_SIZE through the huge message protocol + * or < MPIDI_OFI_global.stripe_threshold through normal send */ + if (((wc->len < MPIDI_OFI_STRIPE_CHUNK_SIZE || + (wc->len > MPIDI_OFI_STRIPE_CHUNK_SIZE && wc->len < MPIDI_OFI_global.stripe_threshold)) && + MPIDI_OFI_COMM(rreq->comm).enable_striping) || + (wc->len < MPIDI_OFI_global.max_msg_size && !MPIDI_OFI_COMM(rreq->comm).enable_striping)) { + return MPIDI_OFI_recv_event(vni, wc, rreq, MPIDI_OFI_REQUEST(rreq, event_id)); + } + + comm_ptr = rreq->comm; + MPIR_T_PVAR_COUNTER_INC(MULTINIC, nic_recvd_bytes_count[MPIDI_OFI_REQUEST(rreq, nic_num)], + wc->len); + /* Check to see if the tracker is already in the unexpected list. + * Otherwise, allocate one. */ + { + MPIDI_OFI_huge_recv_t *list_ptr; + + MPL_DBG_MSG_FMT(MPIR_DBG_PT2PT, VERBOSE, + (MPL_DBG_FDEST, "SEARCHING HUGE UNEXPECTED LIST: (%d, %d, %llu)", + comm_ptr->context_id, MPIDI_OFI_cqe_get_source(wc, false), + (MPIDI_OFI_TAG_MASK & wc->tag))); + + LL_FOREACH(MPIDI_unexp_huge_recv_head, list_ptr) { + if (list_ptr->remote_info.comm_id == comm_ptr->context_id && + list_ptr->remote_info.origin_rank == MPIDI_OFI_cqe_get_source(wc, false) && + list_ptr->remote_info.tag == (MPIDI_OFI_TAG_MASK & wc->tag)) { + MPL_DBG_MSG_FMT(MPIR_DBG_PT2PT, VERBOSE, + (MPL_DBG_FDEST, "MATCHED HUGE UNEXPECTED LIST: (%d, %d, %llu, %d)", + comm_ptr->context_id, MPIDI_OFI_cqe_get_source(wc, false), + (MPIDI_OFI_TAG_MASK & wc->tag), rreq->handle)); + + LL_DELETE(MPIDI_unexp_huge_recv_head, MPIDI_unexp_huge_recv_tail, list_ptr); + + recv_elem = list_ptr; + MPIDIU_map_set(MPIDI_OFI_global.huge_recv_counters, rreq->handle, recv_elem, + MPL_MEM_COMM); + break; + } + } + } + + if (recv_elem) { + ready_to_get = true; + } else { + MPIDI_OFI_huge_recv_list_t *list_ptr; + + MPL_DBG_MSG_FMT(MPIR_DBG_PT2PT, VERBOSE, + (MPL_DBG_FDEST, "CREATING HUGE POSTED ENTRY: (%d, %d, %llu)", + comm_ptr->context_id, MPIDI_OFI_cqe_get_source(wc, false), + (MPIDI_OFI_TAG_MASK & wc->tag))); + + recv_elem = (MPIDI_OFI_huge_recv_t *) MPL_calloc(sizeof(*recv_elem), 1, MPL_MEM_BUFFER); + MPIR_ERR_CHKANDJUMP(recv_elem == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem"); + MPIDIU_map_set(MPIDI_OFI_global.huge_recv_counters, rreq->handle, recv_elem, + MPL_MEM_BUFFER); + + list_ptr = (MPIDI_OFI_huge_recv_list_t *) MPL_calloc(sizeof(*list_ptr), 1, MPL_MEM_BUFFER); + if (!list_ptr) + MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem"); + + list_ptr->comm_id = comm_ptr->context_id; + list_ptr->rank = MPIDI_OFI_cqe_get_source(wc, false); + list_ptr->tag = (MPIDI_OFI_TAG_MASK & wc->tag); + list_ptr->rreq = rreq; + + LL_APPEND(MPIDI_posted_huge_recv_head, MPIDI_posted_huge_recv_tail, list_ptr); + } + + /* Plug the information for the huge event into the receive request and go + * to the MPIDI_OFI_get_huge_event function. */ + recv_elem->event_id = MPIDI_OFI_EVENT_GET_HUGE; + recv_elem->peek = false; + recv_elem->comm_ptr = comm_ptr; + recv_elem->localreq = rreq; + recv_elem->wc = *wc; + if (MPIDI_OFI_COMM(comm_ptr).enable_striping) { + recv_elem->cur_offset = MPIDI_OFI_STRIPE_CHUNK_SIZE; + } else { + recv_elem->cur_offset = MPIDI_OFI_global.max_msg_size; + } + if (ready_to_get) { + MPIDI_OFI_get_huge_event(vni, NULL, (MPIR_Request *) recv_elem); + } + + fn_exit: + MPIR_FUNC_EXIT; + return mpi_errno; + fn_fail: + goto fn_exit; +} + +/* This function is called when we receive a huge control message */ +int MPIDI_OFI_recv_huge_control(MPIDI_OFI_huge_remote_info_t * info) +{ + MPIDI_OFI_huge_recv_t *recv_elem = NULL; + int mpi_errno = MPI_SUCCESS; + MPIR_FUNC_ENTER; + + bool ready_to_get = false; + + /* If there has been a posted receive, search through the list of unmatched + * receives to find the one that goes with the incoming message. */ + { + MPIDI_OFI_huge_recv_list_t *list_ptr; + + MPL_DBG_MSG_FMT(MPIR_DBG_PT2PT, VERBOSE, + (MPL_DBG_FDEST, "SEARCHING POSTED LIST: (%d, %d, %d)", info->comm_id, + info->origin_rank, info->tag)); + + LL_FOREACH(MPIDI_posted_huge_recv_head, list_ptr) { + if (list_ptr->comm_id == info->comm_id && + list_ptr->rank == info->origin_rank && list_ptr->tag == info->tag) { + MPL_DBG_MSG_FMT(MPIR_DBG_PT2PT, VERBOSE, + (MPL_DBG_FDEST, "MATCHED POSTED LIST: (%d, %d, %d, %d)", + info->comm_id, info->origin_rank, info->tag, + list_ptr->rreq->handle)); + + LL_DELETE(MPIDI_posted_huge_recv_head, MPIDI_posted_huge_recv_tail, list_ptr); + + recv_elem = (MPIDI_OFI_huge_recv_t *) + MPIDIU_map_lookup(MPIDI_OFI_global.huge_recv_counters, list_ptr->rreq->handle); + + /* If this is a "peek" element for an MPI_Probe, it shouldn't be matched. Grab the + * important information and remove the element from the list. */ + if (recv_elem->peek) { + MPIR_STATUS_SET_COUNT(recv_elem->localreq->status, info->msgsize); + MPL_atomic_release_store_int(&(MPIDI_OFI_REQUEST(recv_elem->localreq, util_id)), + MPIDI_OFI_PEEK_FOUND); + MPIDIU_map_erase(MPIDI_OFI_global.huge_recv_counters, + recv_elem->localreq->handle); + MPL_free(recv_elem); + recv_elem = NULL; + } + + MPL_free(list_ptr); + break; + } + } + } + + if (recv_elem) { + ready_to_get = true; + } else { + /* Put the struct describing the transfer on an unexpected list to be retrieved later */ + MPL_DBG_MSG_FMT(MPIR_DBG_PT2PT, VERBOSE, + (MPL_DBG_FDEST, "CREATING UNEXPECTED HUGE RECV: (%d, %d, %d)", + info->comm_id, info->origin_rank, info->tag)); + + /* If this is unexpected, create a new tracker and put it in the unexpected list. */ + recv_elem = (MPIDI_OFI_huge_recv_t *) MPL_calloc(sizeof(*recv_elem), 1, MPL_MEM_COMM); + if (!recv_elem) + MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem"); + + LL_APPEND(MPIDI_unexp_huge_recv_head, MPIDI_unexp_huge_recv_tail, recv_elem); + } + + recv_elem->event_id = MPIDI_OFI_EVENT_GET_HUGE; + recv_elem->remote_info = *info; + recv_elem->next = NULL; + if (ready_to_get) { + MPIDI_OFI_get_huge_event(info->vni_dst, NULL, (MPIR_Request *) recv_elem); + } + + fn_exit: + MPIR_FUNC_EXIT; + return mpi_errno; + fn_fail: + goto fn_exit; +} + +int MPIDI_OFI_peek_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq) +{ + int mpi_errno = MPI_SUCCESS; + MPIR_FUNC_ENTER; + + MPI_Aint count = 0; + MPIDI_OFI_huge_recv_t *list_ptr; + bool found_msg = false; + + /* If this is a huge message, find the control message on the unexpected list that matches + * with this and return the size in that. */ + LL_FOREACH(MPIDI_unexp_huge_recv_head, list_ptr) { + uint64_t context_id = MPIDI_OFI_CONTEXT_MASK & wc->tag; + uint64_t tag = MPIDI_OFI_TAG_MASK & wc->tag; + if (list_ptr->remote_info.comm_id == context_id && + list_ptr->remote_info.origin_rank == MPIDI_OFI_cqe_get_source(wc, false) && + list_ptr->remote_info.tag == tag) { + count = list_ptr->remote_info.msgsize; + found_msg = true; + } + } + if (!found_msg) { + /* FIXME: the count is wrong in this case. We need progress until the control message is received */ + MPIDI_OFI_huge_recv_t *recv_elem; + MPIDI_OFI_huge_recv_list_t *huge_list_ptr; + + /* Create an element in the posted list that only indicates a peek and will be + * deleted as soon as it's fulfilled without being matched. */ + recv_elem = (MPIDI_OFI_huge_recv_t *) MPL_calloc(sizeof(*recv_elem), 1, MPL_MEM_COMM); + MPIR_ERR_CHKANDJUMP(recv_elem == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem"); + recv_elem->peek = true; + MPIR_Comm *comm_ptr = rreq->comm; + recv_elem->comm_ptr = comm_ptr; + MPIDIU_map_set(MPIDI_OFI_global.huge_recv_counters, rreq->handle, recv_elem, + MPL_MEM_BUFFER); + + huge_list_ptr = + (MPIDI_OFI_huge_recv_list_t *) MPL_calloc(sizeof(*huge_list_ptr), 1, MPL_MEM_COMM); + MPIR_ERR_CHKANDJUMP(huge_list_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem"); + recv_elem->remote_info.comm_id = huge_list_ptr->comm_id = MPIDI_OFI_CONTEXT_MASK & wc->tag; + recv_elem->remote_info.origin_rank = huge_list_ptr->rank = + MPIDI_OFI_cqe_get_source(wc, false); + recv_elem->remote_info.tag = huge_list_ptr->tag = MPIDI_OFI_TAG_MASK & wc->tag; + recv_elem->localreq = huge_list_ptr->rreq = rreq; + recv_elem->event_id = MPIDI_OFI_EVENT_GET_HUGE; + recv_elem->wc = *wc; + if (MPIDI_OFI_COMM(comm_ptr).enable_striping) { + recv_elem->cur_offset = MPIDI_OFI_STRIPE_CHUNK_SIZE; + } else { + recv_elem->cur_offset = MPIDI_OFI_global.max_msg_size; + } + + LL_APPEND(MPIDI_posted_huge_recv_head, MPIDI_posted_huge_recv_tail, huge_list_ptr); + } + + rreq->status.MPI_SOURCE = MPIDI_OFI_cqe_get_source(wc, false); + rreq->status.MPI_TAG = MPIDI_OFI_init_get_tag(wc->tag); + rreq->status.MPI_ERROR = MPI_SUCCESS; + MPIR_STATUS_SET_COUNT(rreq->status, count); + /* util_id should be the last thing to change in rreq. Reason is + * we use util_id to indicate peek_event has completed and all the + * relevant values have been copied to rreq. */ + MPL_atomic_release_store_int(&(MPIDI_OFI_REQUEST(rreq, util_id)), MPIDI_OFI_PEEK_FOUND); + + fn_exit: + MPIR_FUNC_EXIT; + return mpi_errno; + fn_fail: + goto fn_exit; +} + +static uintptr_t recv_rbase(MPIDI_OFI_huge_recv_t * recv_elem) +{ + if (!MPIDI_OFI_ENABLE_MR_VIRT_ADDRESS) { + return 0; + } else { + return (uintptr_t) recv_elem->remote_info.send_buf; + } +} + +/* Note: MPIDI_OFI_get_huge_event is invoked from three places -- + * 1. In MPIDI_OFI_recv_huge_event, when recv buffer is matched and first chunk received, and + * when control message (with remote info) has also been received. + * 2. In MPIDI_OFI_recv_huge_control, as a callback when control message is received, and + * when first chunk has been matched and received. + * + * MPIDI_OFI_recv_huge_event will fill the local request information, and + * MPIDI_OFI_recv_huge_control will fill the remote (sender) information. Lastly -- + * + * 3. As the event function when RDMA read (issued here) completes. + */ +int MPIDI_OFI_get_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * req) +{ + int mpi_errno = MPI_SUCCESS; + MPIDI_OFI_huge_recv_t *recv_elem = (MPIDI_OFI_huge_recv_t *) req; + uint64_t remote_key; + size_t bytesLeft, bytesToGet; + MPIR_FUNC_ENTER; + + void *recv_buf = MPIDI_OFI_REQUEST(recv_elem->localreq, util.iov.iov_base); + + if (MPIDI_OFI_COMM(recv_elem->comm_ptr).enable_striping) { + /* Subtract one stripe_chunk_size because we send the first chunk via a regular message + * instead of the memory region */ + recv_elem->stripe_size = (recv_elem->remote_info.msgsize - MPIDI_OFI_STRIPE_CHUNK_SIZE) + / MPIDI_OFI_global.num_nics; /* striping */ + + if (recv_elem->stripe_size > MPIDI_OFI_global.max_msg_size) { + recv_elem->stripe_size = MPIDI_OFI_global.max_msg_size; + } + if (recv_elem->chunks_outstanding) + recv_elem->chunks_outstanding--; + bytesLeft = recv_elem->remote_info.msgsize - recv_elem->cur_offset; + bytesToGet = (bytesLeft <= recv_elem->stripe_size) ? bytesLeft : recv_elem->stripe_size; + } else { + /* Subtract one max_msg_size because we send the first chunk via a regular message + * instead of the memory region */ + bytesLeft = recv_elem->remote_info.msgsize - recv_elem->cur_offset; + bytesToGet = (bytesLeft <= MPIDI_OFI_global.max_msg_size) ? + bytesLeft : MPIDI_OFI_global.max_msg_size; + } + if (bytesToGet == 0ULL && recv_elem->chunks_outstanding == 0) { + MPIDI_OFI_send_control_t ctrl; + /* recv_elem->localreq may be freed during MPIDI_OFI_recv_event. + * Need to backup the handle here for later use with MPIDIU_map_erase. */ + uint64_t key_to_erase = recv_elem->localreq->handle; + recv_elem->wc.len = recv_elem->cur_offset; + MPIDI_OFI_recv_event(recv_elem->remote_info.vni_dst, &recv_elem->wc, recv_elem->localreq, + recv_elem->event_id); + ctrl.type = MPIDI_OFI_CTRL_HUGEACK; + ctrl.u.huge_ack.ackreq = recv_elem->remote_info.ackreq; + /* note: it's receiver ack sender */ + int vni_remote = recv_elem->remote_info.vni_src; + int vni_local = recv_elem->remote_info.vni_dst; + mpi_errno = MPIDI_NM_am_send_hdr(recv_elem->remote_info.origin_rank, recv_elem->comm_ptr, + MPIDI_OFI_INTERNAL_HANDLER_CONTROL, + &ctrl, sizeof(ctrl), vni_local, vni_remote); + MPIR_ERR_CHECK(mpi_errno); + + MPIDIU_map_erase(MPIDI_OFI_global.huge_recv_counters, key_to_erase); + MPL_free(recv_elem); + + goto fn_exit; + } + + int nic = 0; + int vni_src = recv_elem->remote_info.vni_src; + int vni_dst = recv_elem->remote_info.vni_dst; + if (MPIDI_OFI_COMM(recv_elem->comm_ptr).enable_striping) { /* if striping enabled */ + MPIDI_OFI_cntr_incr(recv_elem->comm_ptr, vni_src, nic); + if (recv_elem->cur_offset >= MPIDI_OFI_STRIPE_CHUNK_SIZE && bytesLeft > 0) { + for (nic = 0; nic < MPIDI_OFI_global.num_nics; nic++) { + int ctx_idx = MPIDI_OFI_get_ctx_index(recv_elem->comm_ptr, vni_dst, nic); + remote_key = recv_elem->remote_info.rma_keys[nic]; + + bytesLeft = recv_elem->remote_info.msgsize - recv_elem->cur_offset; + if (bytesLeft <= 0) { + break; + } + bytesToGet = + (bytesLeft <= recv_elem->stripe_size) ? bytesLeft : recv_elem->stripe_size; + + MPIDI_OFI_CALL_RETRY(fi_read(MPIDI_OFI_global.ctx[ctx_idx].tx, (void *) ((char *) recv_buf + recv_elem->cur_offset), /* local buffer */ + bytesToGet, /* bytes */ + NULL, /* descriptor */ + MPIDI_OFI_comm_to_phys(recv_elem->comm_ptr, recv_elem->remote_info.origin_rank, nic, vni_dst, vni_src), recv_rbase(recv_elem) + recv_elem->cur_offset, /* remote maddr */ + remote_key, /* Key */ + (void *) &recv_elem->context), nic, /* Context */ + rdma_readfrom, FALSE); + MPIR_T_PVAR_COUNTER_INC(MULTINIC, nic_recvd_bytes_count[nic], bytesToGet); + MPIR_T_PVAR_COUNTER_INC(MULTINIC, striped_nic_recvd_bytes_count[nic], bytesToGet); + recv_elem->cur_offset += bytesToGet; + recv_elem->chunks_outstanding++; + } + } + } else { + int ctx_idx = MPIDI_OFI_get_ctx_index(recv_elem->comm_ptr, vni_src, nic); + remote_key = recv_elem->remote_info.rma_keys[nic]; + MPIDI_OFI_cntr_incr(recv_elem->comm_ptr, vni_src, nic); + MPIDI_OFI_CALL_RETRY(fi_read(MPIDI_OFI_global.ctx[ctx_idx].tx, /* endpoint */ + (void *) ((char *) recv_buf + recv_elem->cur_offset), /* local buffer */ + bytesToGet, /* bytes */ + NULL, /* descriptor */ + MPIDI_OFI_comm_to_phys(recv_elem->comm_ptr, recv_elem->remote_info.origin_rank, nic, vni_src, vni_dst), /* Destination */ + recv_rbase(recv_elem) + recv_elem->cur_offset, /* remote maddr */ + remote_key, /* Key */ + (void *) &recv_elem->context), vni_src, rdma_readfrom, /* Context */ + FALSE); + MPIR_T_PVAR_COUNTER_INC(MULTINIC, nic_recvd_bytes_count[nic], bytesToGet); + recv_elem->cur_offset += bytesToGet; + } + + fn_exit: + MPIR_FUNC_EXIT; + return mpi_errno; + fn_fail: + goto fn_exit; +} diff --git a/src/mpid/ch4/netmod/ofi/ofi_impl.h b/src/mpid/ch4/netmod/ofi/ofi_impl.h index 2a52421ff03..5d2dfa45929 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_impl.h +++ b/src/mpid/ch4/netmod/ofi/ofi_impl.h @@ -308,6 +308,10 @@ MPL_STATIC_INLINE_PREFIX void MPIDI_OFI_cntr_set(int ctx_idx, int val) #define MPIDI_OFI_COLL_MR_KEY 1 #define MPIDI_OFI_INVALID_MR_KEY 0xFFFFFFFFFFFFFFFFULL int MPIDI_OFI_retry_progress(void); +int MPIDI_OFI_recv_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq); +int MPIDI_OFI_recv_huge_control(MPIDI_OFI_huge_remote_info_t * info); +int MPIDI_OFI_peek_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq); +int MPIDI_OFI_get_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * req); int MPIDI_OFI_control_handler(void *am_hdr, void *data, MPI_Aint data_sz, uint32_t attr, MPIR_Request ** req); int MPIDI_OFI_am_rdma_read_ack_handler(void *am_hdr, void *data, diff --git a/src/mpid/ch4/netmod/ofi/util.c b/src/mpid/ch4/netmod/ofi/util.c index 3521b455060..ed2f77dd2ef 100644 --- a/src/mpid/ch4/netmod/ofi/util.c +++ b/src/mpid/ch4/netmod/ofi/util.c @@ -150,87 +150,6 @@ void MPIDI_OFI_mr_key_allocator_destroy(void) MPL_free(mr_key_allocator.bitmask); } -/* Translate the control message to get a huge message into a request to - * actually perform the data transfer. */ -static int MPIDI_OFI_get_huge(int vni, MPIDI_OFI_huge_remote_info_t * info) -{ - MPIDI_OFI_huge_recv_t *recv_elem = NULL; - int mpi_errno = MPI_SUCCESS; - MPIR_FUNC_ENTER; - - bool ready_to_get = false; - - /* If there has been a posted receive, search through the list of unmatched - * receives to find the one that goes with the incoming message. */ - { - MPIDI_OFI_huge_recv_list_t *list_ptr; - - MPL_DBG_MSG_FMT(MPIR_DBG_PT2PT, VERBOSE, - (MPL_DBG_FDEST, "SEARCHING POSTED LIST: (%d, %d, %d)", info->comm_id, - info->origin_rank, info->tag)); - - LL_FOREACH(MPIDI_posted_huge_recv_head, list_ptr) { - if (list_ptr->comm_id == info->comm_id && - list_ptr->rank == info->origin_rank && list_ptr->tag == info->tag) { - MPL_DBG_MSG_FMT(MPIR_DBG_PT2PT, VERBOSE, - (MPL_DBG_FDEST, "MATCHED POSTED LIST: (%d, %d, %d, %d)", - info->comm_id, info->origin_rank, info->tag, - list_ptr->rreq->handle)); - - LL_DELETE(MPIDI_posted_huge_recv_head, MPIDI_posted_huge_recv_tail, list_ptr); - - recv_elem = (MPIDI_OFI_huge_recv_t *) - MPIDIU_map_lookup(MPIDI_OFI_global.huge_recv_counters, list_ptr->rreq->handle); - - /* If this is a "peek" element for an MPI_Probe, it shouldn't be matched. Grab the - * important information and remove the element from the list. */ - if (recv_elem->peek) { - MPIR_STATUS_SET_COUNT(recv_elem->localreq->status, info->msgsize); - MPL_atomic_release_store_int(&(MPIDI_OFI_REQUEST(recv_elem->localreq, util_id)), - MPIDI_OFI_PEEK_FOUND); - MPIDIU_map_erase(MPIDI_OFI_global.huge_recv_counters, - recv_elem->localreq->handle); - MPL_free(recv_elem); - recv_elem = NULL; - } - - MPL_free(list_ptr); - break; - } - } - } - - if (recv_elem) { - ready_to_get = true; - } else { - /* Put the struct describing the transfer on an unexpected list to be retrieved later */ - MPL_DBG_MSG_FMT(MPIR_DBG_PT2PT, VERBOSE, - (MPL_DBG_FDEST, "CREATING UNEXPECTED HUGE RECV: (%d, %d, %d)", - info->comm_id, info->origin_rank, info->tag)); - - /* If this is unexpected, create a new tracker and put it in the unexpected list. */ - recv_elem = (MPIDI_OFI_huge_recv_t *) MPL_calloc(sizeof(*recv_elem), 1, MPL_MEM_COMM); - if (!recv_elem) - MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem"); - - LL_APPEND(MPIDI_unexp_huge_recv_head, MPIDI_unexp_huge_recv_tail, recv_elem); - } - - recv_elem->event_id = MPIDI_OFI_EVENT_GET_HUGE; - recv_elem->remote_info = *info; - recv_elem->next = NULL; - if (ready_to_get) { - MPIDI_OFI_get_huge_event(vni, NULL, (MPIR_Request *) recv_elem); - } - - MPIR_FUNC_EXIT; - - fn_exit: - return mpi_errno; - fn_fail: - goto fn_exit; -} - int MPIDI_OFI_control_handler(void *am_hdr, void *data, MPI_Aint data_sz, uint32_t attr, MPIR_Request ** req) { @@ -241,13 +160,14 @@ int MPIDI_OFI_control_handler(void *am_hdr, void *data, MPI_Aint data_sz, *req = NULL; } + int local_vci = MPIDIG_AM_ATTR_DST_VCI(attr); switch (ctrlsend->type) { case MPIDI_OFI_CTRL_HUGEACK: - mpi_errno = MPIDI_OFI_dispatch_function(0, NULL, ctrlsend->u.huge_ack.ackreq); + mpi_errno = MPIDI_OFI_dispatch_function(local_vci, NULL, ctrlsend->u.huge_ack.ackreq); break; case MPIDI_OFI_CTRL_HUGE: - mpi_errno = MPIDI_OFI_get_huge(0, &(ctrlsend->u.huge)); + mpi_errno = MPIDI_OFI_recv_huge_control(&(ctrlsend->u.huge)); break; default: From 4e9d269d55664631fbbb6c26693052a868f6ba59 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Mon, 27 Sep 2021 22:51:49 -0500 Subject: [PATCH 07/21] ch4/ofi: fix fi_read in enable_striping The counter was not incremented for the correct nic. Add FIXME to note potential issues with multiple simultaneous fi_read. --- src/mpid/ch4/netmod/ofi/ofi_huge.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_huge.c b/src/mpid/ch4/netmod/ofi/ofi_huge.c index a3fa2ea2713..4e017fe0f69 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_huge.c +++ b/src/mpid/ch4/netmod/ofi/ofi_huge.c @@ -333,13 +333,11 @@ int MPIDI_OFI_get_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reques goto fn_exit; } - int nic = 0; int vni_src = recv_elem->remote_info.vni_src; int vni_dst = recv_elem->remote_info.vni_dst; if (MPIDI_OFI_COMM(recv_elem->comm_ptr).enable_striping) { /* if striping enabled */ - MPIDI_OFI_cntr_incr(recv_elem->comm_ptr, vni_src, nic); if (recv_elem->cur_offset >= MPIDI_OFI_STRIPE_CHUNK_SIZE && bytesLeft > 0) { - for (nic = 0; nic < MPIDI_OFI_global.num_nics; nic++) { + for (int nic = 0; nic < MPIDI_OFI_global.num_nics; nic++) { int ctx_idx = MPIDI_OFI_get_ctx_index(recv_elem->comm_ptr, vni_dst, nic); remote_key = recv_elem->remote_info.rma_keys[nic]; @@ -350,6 +348,8 @@ int MPIDI_OFI_get_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reques bytesToGet = (bytesLeft <= recv_elem->stripe_size) ? bytesLeft : recv_elem->stripe_size; + /* FIXME: Can we issue concurrent fi_read with the same context? */ + MPIDI_OFI_cntr_incr(recv_elem->comm_ptr, vni_src, nic); MPIDI_OFI_CALL_RETRY(fi_read(MPIDI_OFI_global.ctx[ctx_idx].tx, (void *) ((char *) recv_buf + recv_elem->cur_offset), /* local buffer */ bytesToGet, /* bytes */ NULL, /* descriptor */ @@ -364,6 +364,7 @@ int MPIDI_OFI_get_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reques } } } else { + int nic = 0; int ctx_idx = MPIDI_OFI_get_ctx_index(recv_elem->comm_ptr, vni_src, nic); remote_key = recv_elem->remote_info.rma_keys[nic]; MPIDI_OFI_cntr_incr(recv_elem->comm_ptr, vni_src, nic); From 1e1ad688d7f381350e9b20542baa883095c14a56 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Sun, 26 Sep 2021 14:50:43 -0500 Subject: [PATCH 08/21] ch4/ofi: detect normal send in event loop Rather than check and fallback in MPIDI_OFI_recv_huge_event, check whether it's a huge message in event loop and dispatch accordingly. --- src/mpid/ch4/netmod/ofi/ofi_events.c | 6 +++++- src/mpid/ch4/netmod/ofi/ofi_huge.c | 13 ++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_events.c b/src/mpid/ch4/netmod/ofi/ofi_events.c index f84d17808dd..cbc8229133f 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_events.c +++ b/src/mpid/ch4/netmod/ofi/ofi_events.c @@ -488,7 +488,11 @@ int MPIDI_OFI_dispatch_function(int vni, struct fi_cq_tagged_entry *wc, MPIR_Req break; case MPIDI_OFI_EVENT_RECV_HUGE: - mpi_errno = MPIDI_OFI_recv_huge_event(vni, wc, req); + if (wc->tag & MPIDI_OFI_HUGE_SEND) { + mpi_errno = MPIDI_OFI_recv_huge_event(vni, wc, req); + } else { + mpi_errno = MPIDI_OFI_recv_event(vni, wc, req, MPIDI_OFI_EVENT_RECV_HUGE); + } break; case MPIDI_OFI_EVENT_RECV_PACK: diff --git a/src/mpid/ch4/netmod/ofi/ofi_huge.c b/src/mpid/ch4/netmod/ofi/ofi_huge.c index 4e017fe0f69..b41235ccbd9 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_huge.c +++ b/src/mpid/ch4/netmod/ofi/ofi_huge.c @@ -16,15 +16,10 @@ int MPIDI_OFI_recv_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reque MPIR_FUNC_ENTER; bool ready_to_get = false; - /* Check that the sender didn't underflow the message by sending less than - * the huge message threshold. When striping is enabled underflow occurs if - * the sender sends < MPIDI_OFI_STRIPE_CHUNK_SIZE through the huge message protocol - * or < MPIDI_OFI_global.stripe_threshold through normal send */ - if (((wc->len < MPIDI_OFI_STRIPE_CHUNK_SIZE || - (wc->len > MPIDI_OFI_STRIPE_CHUNK_SIZE && wc->len < MPIDI_OFI_global.stripe_threshold)) && - MPIDI_OFI_COMM(rreq->comm).enable_striping) || - (wc->len < MPIDI_OFI_global.max_msg_size && !MPIDI_OFI_COMM(rreq->comm).enable_striping)) { - return MPIDI_OFI_recv_event(vni, wc, rreq, MPIDI_OFI_REQUEST(rreq, event_id)); + if (MPIDI_OFI_COMM(rreq->comm).enable_striping) { + MPIR_Assert(wc->len == MPIDI_OFI_STRIPE_CHUNK_SIZE); + } else { + MPIR_Assert(wc->len == MPIDI_OFI_global.max_msg_size); } comm_ptr = rreq->comm; From 4f661a7166ad2d58bcef730890419f323027b805 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Fri, 24 Sep 2021 14:37:42 -0500 Subject: [PATCH 09/21] ch4/ofi: detect huge message data truncation This partially fixes test/mpi/errors/pt2pt/truncmsg1 when set MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384. We still need fix the case when huge message sent but small buffer is posted. --- src/mpid/ch4/netmod/ofi/ofi_events.c | 2 +- src/mpid/ch4/netmod/ofi/ofi_events.h | 5 +++-- src/mpid/ch4/netmod/ofi/ofi_huge.c | 5 +++++ 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_events.c b/src/mpid/ch4/netmod/ofi/ofi_events.c index cbc8229133f..4d1f0ffa4db 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_events.c +++ b/src/mpid/ch4/netmod/ofi/ofi_events.c @@ -585,10 +585,10 @@ int MPIDI_OFI_handle_cq_error(int vni, int nic, ssize_t ret) break; case MPIR_REQUEST_KIND__RECV: + req->status.MPI_ERROR = MPI_ERR_TRUNCATE; mpi_errno = MPIDI_OFI_dispatch_function(vni, (struct fi_cq_tagged_entry *) &e, req); - req->status.MPI_ERROR = MPI_ERR_TRUNCATE; break; default: diff --git a/src/mpid/ch4/netmod/ofi/ofi_events.h b/src/mpid/ch4/netmod/ofi/ofi_events.h index b4b00230e37..faacbeb9b9f 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_events.h +++ b/src/mpid/ch4/netmod/ofi/ofi_events.h @@ -54,7 +54,9 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_recv_event(int vni, struct fi_cq_tagged_e MPIR_FUNC_ENTER; rreq->status.MPI_SOURCE = MPIDI_OFI_cqe_get_source(wc, true); - rreq->status.MPI_ERROR = MPIDI_OFI_idata_get_error_bits(wc->data); + if (!rreq->status.MPI_ERROR) { + rreq->status.MPI_ERROR = MPIDI_OFI_idata_get_error_bits(wc->data); + } rreq->status.MPI_TAG = MPIDI_OFI_init_get_tag(wc->tag); count = wc->len; MPIR_STATUS_SET_COUNT(rreq->status, count); @@ -129,7 +131,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_recv_event(int vni, struct fi_cq_tagged_e MPIDIU_request_complete(rreq); - /* Polling loop will check for truncation */ fn_exit: MPIR_FUNC_EXIT; return mpi_errno; diff --git a/src/mpid/ch4/netmod/ofi/ofi_huge.c b/src/mpid/ch4/netmod/ofi/ofi_huge.c index b41235ccbd9..1b77040385c 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_huge.c +++ b/src/mpid/ch4/netmod/ofi/ofi_huge.c @@ -283,6 +283,11 @@ int MPIDI_OFI_get_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reques MPIR_FUNC_ENTER; void *recv_buf = MPIDI_OFI_REQUEST(recv_elem->localreq, util.iov.iov_base); + MPI_Aint data_sz = MPIDI_OFI_REQUEST(recv_elem->localreq, util.iov.iov_len); + if (recv_elem->remote_info.msgsize > data_sz) { + recv_elem->localreq->status.MPI_ERROR = MPI_ERR_TRUNCATE; + recv_elem->remote_info.msgsize = data_sz; + } if (MPIDI_OFI_COMM(recv_elem->comm_ptr).enable_striping) { /* Subtract one stripe_chunk_size because we send the first chunk via a regular message From b63867e51a46f71f037f1c9c26be906f8ff63877 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Fri, 24 Sep 2021 14:47:43 -0500 Subject: [PATCH 10/21] ch4/ofi: swap order of sending huge ctrl The progress of huge message will depend on receiving the ctrl message. Send it first to promote the likelihood of ctrl header not arriving too much behind the message body. --- src/mpid/ch4/netmod/ofi/ofi_send.h | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_send.h b/src/mpid/ch4/netmod/ofi/ofi_send.h index ea2412cef7a..56a2cb8334f 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_send.h +++ b/src/mpid/ch4/netmod/ofi/ofi_send.h @@ -283,8 +283,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_normal(const void *buf, MPI_Aint cou struct fid_mr **huge_send_mrs; uint64_t msg_size = MPIDI_OFI_STRIPE_CHUNK_SIZE; - MPIDI_OFI_REQUEST(sreq, event_id) = MPIDI_OFI_EVENT_SEND_HUGE; - MPIR_cc_inc(sreq->cc_ptr); if (!MPIDI_OFI_COMM(comm).enable_striping) { num_nics = 1; @@ -333,18 +331,8 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_normal(const void *buf, MPI_Aint cou MPIR_Comm_add_ref(comm); /* Store ordering unnecessary for dst_rank, so use relaxed store */ MPL_atomic_relaxed_store_int(&MPIDI_OFI_REQUEST(sreq, util_id), dst_rank); - match_bits |= MPIDI_OFI_HUGE_SEND; /* Add the bit for a huge message */ - MPIDI_OFI_CALL_RETRY(fi_tsenddata(MPIDI_OFI_global.ctx[ctx_idx].tx, - send_buf, msg_size, NULL /* desc */ , - cq_data, - MPIDI_OFI_av_to_phys(addr, receiver_nic, vni_local, - vni_remote), - match_bits, - (void *) &(MPIDI_OFI_REQUEST(sreq, context))), - vni_local, tsenddata, FALSE /* eagain */); - MPIR_T_PVAR_COUNTER_INC(MULTINIC, nic_sent_bytes_count[sender_nic], msg_size); - MPIR_T_PVAR_COUNTER_INC(MULTINIC, striped_nic_sent_bytes_count[sender_nic], msg_size); + /* send ctrl message first */ MPIDI_OFI_send_control_t ctrl; ctrl.type = MPIDI_OFI_CTRL_HUGE; for (int i = 0; i < num_nics; i++) { @@ -359,10 +347,24 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_normal(const void *buf, MPI_Aint cou ctrl.u.huge.comm_id = comm->context_id; ctrl.u.huge.ackreq = sreq; - /* Send information about the memory region here to get the lmt going. */ mpi_errno = MPIDI_NM_am_send_hdr(dst_rank, comm, MPIDI_OFI_INTERNAL_HANDLER_CONTROL, &ctrl, sizeof(ctrl), vni_src, vni_dst); MPIR_ERR_CHECK(mpi_errno); + + /* send main native message next */ + MPIDI_OFI_REQUEST(sreq, event_id) = MPIDI_OFI_EVENT_SEND_HUGE; + + match_bits |= MPIDI_OFI_HUGE_SEND; /* Add the bit for a huge message */ + MPIDI_OFI_CALL_RETRY(fi_tsenddata(MPIDI_OFI_global.ctx[ctx_idx].tx, + send_buf, msg_size, NULL /* desc */ , + cq_data, + MPIDI_OFI_av_to_phys(addr, receiver_nic, vni_local, + vni_remote), + match_bits, + (void *) &(MPIDI_OFI_REQUEST(sreq, context))), + vni_local, tsenddata, FALSE /* eagain */); + MPIR_T_PVAR_COUNTER_INC(MULTINIC, nic_sent_bytes_count[sender_nic], msg_size); + MPIR_T_PVAR_COUNTER_INC(MULTINIC, striped_nic_sent_bytes_count[sender_nic], msg_size); } fn_exit: From 0ec4a61195e370ba7d5a6ec5695c0384a8e9207c Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Sun, 26 Sep 2021 09:24:45 -0500 Subject: [PATCH 11/21] ch4/ofi: remove debug messages from huge path The debug messages are temporary and should be removed after debugging. --- src/mpid/ch4/netmod/ofi/ofi_huge.c | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_huge.c b/src/mpid/ch4/netmod/ofi/ofi_huge.c index 1b77040385c..c669163460f 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_huge.c +++ b/src/mpid/ch4/netmod/ofi/ofi_huge.c @@ -30,20 +30,10 @@ int MPIDI_OFI_recv_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reque { MPIDI_OFI_huge_recv_t *list_ptr; - MPL_DBG_MSG_FMT(MPIR_DBG_PT2PT, VERBOSE, - (MPL_DBG_FDEST, "SEARCHING HUGE UNEXPECTED LIST: (%d, %d, %llu)", - comm_ptr->context_id, MPIDI_OFI_cqe_get_source(wc, false), - (MPIDI_OFI_TAG_MASK & wc->tag))); - LL_FOREACH(MPIDI_unexp_huge_recv_head, list_ptr) { if (list_ptr->remote_info.comm_id == comm_ptr->context_id && list_ptr->remote_info.origin_rank == MPIDI_OFI_cqe_get_source(wc, false) && list_ptr->remote_info.tag == (MPIDI_OFI_TAG_MASK & wc->tag)) { - MPL_DBG_MSG_FMT(MPIR_DBG_PT2PT, VERBOSE, - (MPL_DBG_FDEST, "MATCHED HUGE UNEXPECTED LIST: (%d, %d, %llu, %d)", - comm_ptr->context_id, MPIDI_OFI_cqe_get_source(wc, false), - (MPIDI_OFI_TAG_MASK & wc->tag), rreq->handle)); - LL_DELETE(MPIDI_unexp_huge_recv_head, MPIDI_unexp_huge_recv_tail, list_ptr); recv_elem = list_ptr; @@ -59,11 +49,6 @@ int MPIDI_OFI_recv_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reque } else { MPIDI_OFI_huge_recv_list_t *list_ptr; - MPL_DBG_MSG_FMT(MPIR_DBG_PT2PT, VERBOSE, - (MPL_DBG_FDEST, "CREATING HUGE POSTED ENTRY: (%d, %d, %llu)", - comm_ptr->context_id, MPIDI_OFI_cqe_get_source(wc, false), - (MPIDI_OFI_TAG_MASK & wc->tag))); - recv_elem = (MPIDI_OFI_huge_recv_t *) MPL_calloc(sizeof(*recv_elem), 1, MPL_MEM_BUFFER); MPIR_ERR_CHKANDJUMP(recv_elem == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem"); MPIDIU_map_set(MPIDI_OFI_global.huge_recv_counters, rreq->handle, recv_elem, @@ -118,18 +103,9 @@ int MPIDI_OFI_recv_huge_control(MPIDI_OFI_huge_remote_info_t * info) { MPIDI_OFI_huge_recv_list_t *list_ptr; - MPL_DBG_MSG_FMT(MPIR_DBG_PT2PT, VERBOSE, - (MPL_DBG_FDEST, "SEARCHING POSTED LIST: (%d, %d, %d)", info->comm_id, - info->origin_rank, info->tag)); - LL_FOREACH(MPIDI_posted_huge_recv_head, list_ptr) { if (list_ptr->comm_id == info->comm_id && list_ptr->rank == info->origin_rank && list_ptr->tag == info->tag) { - MPL_DBG_MSG_FMT(MPIR_DBG_PT2PT, VERBOSE, - (MPL_DBG_FDEST, "MATCHED POSTED LIST: (%d, %d, %d, %d)", - info->comm_id, info->origin_rank, info->tag, - list_ptr->rreq->handle)); - LL_DELETE(MPIDI_posted_huge_recv_head, MPIDI_posted_huge_recv_tail, list_ptr); recv_elem = (MPIDI_OFI_huge_recv_t *) @@ -157,11 +133,6 @@ int MPIDI_OFI_recv_huge_control(MPIDI_OFI_huge_remote_info_t * info) ready_to_get = true; } else { /* Put the struct describing the transfer on an unexpected list to be retrieved later */ - MPL_DBG_MSG_FMT(MPIR_DBG_PT2PT, VERBOSE, - (MPL_DBG_FDEST, "CREATING UNEXPECTED HUGE RECV: (%d, %d, %d)", - info->comm_id, info->origin_rank, info->tag)); - - /* If this is unexpected, create a new tracker and put it in the unexpected list. */ recv_elem = (MPIDI_OFI_huge_recv_t *) MPL_calloc(sizeof(*recv_elem), 1, MPL_MEM_COMM); if (!recv_elem) MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem"); From f33a14a8e6cc679e30b74dcc4128ef362795110e Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Sun, 26 Sep 2021 14:00:53 -0500 Subject: [PATCH 12/21] ch4/ofi: differentiate probe and mprobe of huge messages When probing huge messages and control is missing, we should handle probe and mprobe differently. With probe, we can simply return not found. With mprobe, we can enqueue the rreq since the entry is guaranteed not to be double matched. --- src/mpid/ch4/netmod/ofi/ofi_huge.c | 34 ++++++++++++++++++++--------- src/mpid/ch4/netmod/ofi/ofi_pre.h | 6 +++++ src/mpid/ch4/netmod/ofi/ofi_probe.h | 16 +++++++++----- 3 files changed, 41 insertions(+), 15 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_huge.c b/src/mpid/ch4/netmod/ofi/ofi_huge.c index c669163460f..73e632c7471 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_huge.c +++ b/src/mpid/ch4/netmod/ofi/ofi_huge.c @@ -175,8 +175,20 @@ int MPIDI_OFI_peek_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reque found_msg = true; } } - if (!found_msg) { - /* FIXME: the count is wrong in this case. We need progress until the control message is received */ + if (found_msg) { + rreq->status.MPI_SOURCE = MPIDI_OFI_cqe_get_source(wc, false); + rreq->status.MPI_TAG = MPIDI_OFI_init_get_tag(wc->tag); + rreq->status.MPI_ERROR = MPI_SUCCESS; + MPIR_STATUS_SET_COUNT(rreq->status, count); + /* util_id should be the last thing to change in rreq. Reason is + * we use util_id to indicate peek_event has completed and all the + * relevant values have been copied to rreq. */ + MPL_atomic_release_store_int(&(MPIDI_OFI_REQUEST(rreq, util_id)), MPIDI_OFI_PEEK_FOUND); + } else if (MPIDI_OFI_REQUEST(rreq, kind) == MPIDI_OFI_req_kind__probe) { + /* return not found for this probe. User can probe again. */ + MPL_atomic_release_store_int(&(MPIDI_OFI_REQUEST(rreq, util_id)), MPIDI_OFI_PEEK_NOT_FOUND); + } else if (MPIDI_OFI_REQUEST(rreq, kind) == MPIDI_OFI_req_kind__mprobe) { + /* post the rreq to list and let control handler handle it */ MPIDI_OFI_huge_recv_t *recv_elem; MPIDI_OFI_huge_recv_list_t *huge_list_ptr; @@ -207,16 +219,18 @@ int MPIDI_OFI_peek_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reque } LL_APPEND(MPIDI_posted_huge_recv_head, MPIDI_posted_huge_recv_tail, huge_list_ptr); + + /* FIXME: we don't have the correct count so it wrong to return FOUND here */ + rreq->status.MPI_SOURCE = MPIDI_OFI_cqe_get_source(wc, false); + rreq->status.MPI_TAG = MPIDI_OFI_init_get_tag(wc->tag); + rreq->status.MPI_ERROR = MPI_SUCCESS; + MPIR_STATUS_SET_COUNT(rreq->status, count); + /* util_id should be the last thing to change in rreq. Reason is + * we use util_id to indicate peek_event has completed and all the + * relevant values have been copied to rreq. */ + MPL_atomic_release_store_int(&(MPIDI_OFI_REQUEST(rreq, util_id)), MPIDI_OFI_PEEK_FOUND); } - rreq->status.MPI_SOURCE = MPIDI_OFI_cqe_get_source(wc, false); - rreq->status.MPI_TAG = MPIDI_OFI_init_get_tag(wc->tag); - rreq->status.MPI_ERROR = MPI_SUCCESS; - MPIR_STATUS_SET_COUNT(rreq->status, count); - /* util_id should be the last thing to change in rreq. Reason is - * we use util_id to indicate peek_event has completed and all the - * relevant values have been copied to rreq. */ - MPL_atomic_release_store_int(&(MPIDI_OFI_REQUEST(rreq, util_id)), MPIDI_OFI_PEEK_FOUND); fn_exit: MPIR_FUNC_EXIT; diff --git a/src/mpid/ch4/netmod/ofi/ofi_pre.h b/src/mpid/ch4/netmod/ofi/ofi_pre.h index 593de7ef64e..9a34e1fea74 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_pre.h +++ b/src/mpid/ch4/netmod/ofi/ofi_pre.h @@ -176,6 +176,11 @@ typedef struct { MPI_Aint data_sz; /* save data_sz to avoid double checking */ } MPIDI_OFI_am_request_t; +enum MPIDI_OFI_req_kind { + MPIDI_OFI_req_kind__any, + MPIDI_OFI_req_kind__probe, + MPIDI_OFI_req_kind__mprobe, +}; typedef struct { struct fi_context context[MPIDI_OFI_CONTEXT_STRUCTS]; /* fixed field, do not move */ @@ -184,6 +189,7 @@ typedef struct { MPI_Datatype datatype; int nic_num; /* Store the nic number so we can use it to cancel a request later * if needed. */ + enum MPIDI_OFI_req_kind kind; union { struct { void *buf; diff --git a/src/mpid/ch4/netmod/ofi/ofi_probe.h b/src/mpid/ch4/netmod/ofi/ofi_probe.h index 14181444b55..60b1ed16ac9 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_probe.h +++ b/src/mpid/ch4/netmod/ofi/ofi_probe.h @@ -15,7 +15,8 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_iprobe(int source, MPIDI_av_entry_t * addr, int vni_src, int vni_dst, int *flag, MPI_Status * status, - MPIR_Request ** message, uint64_t peek_flags) + MPIR_Request ** message, + enum MPIDI_OFI_req_kind probe_kind) { int mpi_errno = MPI_SUCCESS; fi_addr_t remote_proc; @@ -41,6 +42,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_iprobe(int source, } else { rreq = &r; } + MPIDI_OFI_REQUEST(rreq, kind) = probe_kind; rreq->comm = comm; MPIR_Comm_add_ref(comm); @@ -58,8 +60,11 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_iprobe(int source, msg.context = (void *) &(MPIDI_OFI_REQUEST(rreq, context)); msg.data = 0; - MPIDI_OFI_CALL_RETURN(fi_trecvmsg(MPIDI_OFI_global.ctx[ctx_idx].rx, &msg, - peek_flags | FI_PEEK | FI_COMPLETION), ofi_err); + uint64_t recv_flags = FI_PEEK | FI_COMPLETION; + if (probe_kind == MPIDI_OFI_req_kind__mprobe) { + recv_flags |= FI_CLAIM; + } + MPIDI_OFI_CALL_RETURN(fi_trecvmsg(MPIDI_OFI_global.ctx[ctx_idx].rx, &msg, recv_flags), ofi_err); if (ofi_err == -FI_ENOMSG) { *flag = 0; if (message) @@ -138,7 +143,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_mpi_improbe(int source, MPIDI_OFI_THREAD_CS_ENTER_VCI_OPTIONAL(vni_dst); /* Set flags for mprobe peek, when ready */ mpi_errno = MPIDI_OFI_do_iprobe(source, tag, comm, context_offset, addr, vni_src, vni_dst, - flag, status, message, FI_CLAIM | FI_COMPLETION); + flag, status, message, MPIDI_OFI_req_kind__mprobe); MPIDI_OFI_THREAD_CS_EXIT_VCI_OPTIONAL(vni_dst); if (mpi_errno != MPI_SUCCESS) @@ -166,7 +171,8 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_mpi_iprobe(int source, } else { MPIDI_OFI_THREAD_CS_ENTER_VCI_OPTIONAL(vni_dst); mpi_errno = MPIDI_OFI_do_iprobe(source, tag, comm, context_offset, addr, - vni_src, vni_dst, flag, status, NULL, 0ULL); + vni_src, vni_dst, flag, status, NULL, + MPIDI_OFI_req_kind__probe); MPIDI_OFI_THREAD_CS_EXIT_VCI_OPTIONAL(vni_dst); } From 54b390563bc249bc537227060eb151e9e4a55430 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Sun, 26 Sep 2021 15:04:28 -0500 Subject: [PATCH 13/21] ch4/ofi: remove MPIDI_OFI_global.huge_send_counters Store huge_send_mrs in the sreq so we don't need the extra global map. --- src/mpid/ch4/netmod/ofi/ofi_events.c | 12 +----------- src/mpid/ch4/netmod/ofi/ofi_init.c | 2 -- src/mpid/ch4/netmod/ofi/ofi_pre.h | 4 ++++ src/mpid/ch4/netmod/ofi/ofi_send.h | 4 +--- src/mpid/ch4/netmod/ofi/ofi_types.h | 1 - 5 files changed, 6 insertions(+), 17 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_events.c b/src/mpid/ch4/netmod/ofi/ofi_events.c index 4d1f0ffa4db..ec7f637dae2 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_events.c +++ b/src/mpid/ch4/netmod/ofi/ofi_events.c @@ -90,21 +90,11 @@ static int send_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request if (c == 0) { MPIR_Comm *comm; - void *ptr; struct fid_mr **huge_send_mrs; comm = sreq->comm; num_nics = MPIDI_OFI_COMM(comm).enable_striping ? MPIDI_OFI_global.num_nics : 1; - /* Look for the memory region using the sreq handle */ - ptr = MPIDIU_map_lookup(MPIDI_OFI_global.huge_send_counters, sreq->handle); - MPIR_Assert(ptr != MPIDIU_MAP_NOT_FOUND); - - huge_send_mrs = (struct fid_mr **) ptr; - - /* Send a cleanup message to the receivier and clean up local - * resources. */ - /* Clean up the local counter */ - MPIDIU_map_erase(MPIDI_OFI_global.huge_send_counters, sreq->handle); + huge_send_mrs = MPIDI_OFI_REQUEST(sreq, huge_info.huge_send_mrs); /* Clean up the memory region */ if (!MPIDI_OFI_ENABLE_MR_PROV_KEY) { diff --git a/src/mpid/ch4/netmod/ofi/ofi_init.c b/src/mpid/ch4/netmod/ofi/ofi_init.c index eb941a3183a..6bb1bd91d9d 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_init.c +++ b/src/mpid/ch4/netmod/ofi/ofi_init.c @@ -559,7 +559,6 @@ int MPIDI_OFI_init_local(int *tag_bits) MPIDIU_map_create(&MPIDI_OFI_global.req_map, MPL_MEM_OTHER); /* Create huge protocol maps */ - MPIDIU_map_create(&MPIDI_OFI_global.huge_send_counters, MPL_MEM_COMM); MPIDIU_map_create(&MPIDI_OFI_global.huge_recv_counters, MPL_MEM_COMM); /* Initialize RMA keys allocator */ @@ -904,7 +903,6 @@ int MPIDI_OFI_mpi_finalize_hook(void) MPIDIU_map_destroy(MPIDI_OFI_global.win_map); MPIDIU_map_destroy(MPIDI_OFI_global.req_map); - MPIDIU_map_destroy(MPIDI_OFI_global.huge_send_counters); MPIDIU_map_destroy(MPIDI_OFI_global.huge_recv_counters); if (MPIDI_OFI_ENABLE_AM) { diff --git a/src/mpid/ch4/netmod/ofi/ofi_pre.h b/src/mpid/ch4/netmod/ofi/ofi_pre.h index 9a34e1fea74..9e0afe22a27 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_pre.h +++ b/src/mpid/ch4/netmod/ofi/ofi_pre.h @@ -190,6 +190,10 @@ typedef struct { int nic_num; /* Store the nic number so we can use it to cancel a request later * if needed. */ enum MPIDI_OFI_req_kind kind; + union { + struct fid_mr **huge_send_mrs; + MPIDI_OFI_huge_remote_info_t *info; + } huge_info; union { struct { void *buf; diff --git a/src/mpid/ch4/netmod/ofi/ofi_send.h b/src/mpid/ch4/netmod/ofi/ofi_send.h index 56a2cb8334f..2f6f189d9d0 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_send.h +++ b/src/mpid/ch4/netmod/ofi/ofi_send.h @@ -313,9 +313,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_normal(const void *buf, MPI_Aint cou &huge_send_mrs[i], /* Out: memregion object */ NULL), mr_reg); /* In: context */ } - /* Create map to the memory region */ - MPIDIU_map_set(MPIDI_OFI_global.huge_send_counters, sreq->handle, huge_send_mrs, - MPL_MEM_BUFFER); + MPIDI_OFI_REQUEST(sreq, huge_info.huge_send_mrs) = huge_send_mrs; if (MPIDI_OFI_ENABLE_MR_PROV_KEY) { /* MR_BASIC */ for (int i = 0; i < num_nics; i++) { diff --git a/src/mpid/ch4/netmod/ofi/ofi_types.h b/src/mpid/ch4/netmod/ofi/ofi_types.h index 741bf92be32..9b14d791c62 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_types.h +++ b/src/mpid/ch4/netmod/ofi/ofi_types.h @@ -362,7 +362,6 @@ typedef struct { MPIDI_OFI_atomic_valid_t win_op_table[MPIR_DATATYPE_N_PREDEFINED][MPIDIG_ACCU_NUM_OP]; /* huge protocol globals */ - void *huge_send_counters; void *huge_recv_counters; /* Active Message Globals */ From a7e5407a5e2d660e26c61cb0e6bff47fd2d096b1 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Sun, 26 Sep 2021 15:49:23 -0500 Subject: [PATCH 14/21] ch4/ofi: remove huge_recv_counters Store the recv_elem pointer with rreq, so we don't need extra huge_recv_counters map. --- src/mpid/ch4/netmod/ofi/ofi_huge.c | 15 ++++----------- src/mpid/ch4/netmod/ofi/ofi_init.c | 5 ----- src/mpid/ch4/netmod/ofi/ofi_pre.h | 2 +- src/mpid/ch4/netmod/ofi/ofi_types.h | 3 --- 4 files changed, 5 insertions(+), 20 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_huge.c b/src/mpid/ch4/netmod/ofi/ofi_huge.c index 73e632c7471..0e2d97e63b7 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_huge.c +++ b/src/mpid/ch4/netmod/ofi/ofi_huge.c @@ -37,8 +37,7 @@ int MPIDI_OFI_recv_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reque LL_DELETE(MPIDI_unexp_huge_recv_head, MPIDI_unexp_huge_recv_tail, list_ptr); recv_elem = list_ptr; - MPIDIU_map_set(MPIDI_OFI_global.huge_recv_counters, rreq->handle, recv_elem, - MPL_MEM_COMM); + MPIDI_OFI_REQUEST(rreq, huge_info.recv_elem) = recv_elem; break; } } @@ -51,8 +50,7 @@ int MPIDI_OFI_recv_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reque recv_elem = (MPIDI_OFI_huge_recv_t *) MPL_calloc(sizeof(*recv_elem), 1, MPL_MEM_BUFFER); MPIR_ERR_CHKANDJUMP(recv_elem == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem"); - MPIDIU_map_set(MPIDI_OFI_global.huge_recv_counters, rreq->handle, recv_elem, - MPL_MEM_BUFFER); + MPIDI_OFI_REQUEST(rreq, huge_info.recv_elem) = recv_elem; list_ptr = (MPIDI_OFI_huge_recv_list_t *) MPL_calloc(sizeof(*list_ptr), 1, MPL_MEM_BUFFER); if (!list_ptr) @@ -108,8 +106,7 @@ int MPIDI_OFI_recv_huge_control(MPIDI_OFI_huge_remote_info_t * info) list_ptr->rank == info->origin_rank && list_ptr->tag == info->tag) { LL_DELETE(MPIDI_posted_huge_recv_head, MPIDI_posted_huge_recv_tail, list_ptr); - recv_elem = (MPIDI_OFI_huge_recv_t *) - MPIDIU_map_lookup(MPIDI_OFI_global.huge_recv_counters, list_ptr->rreq->handle); + recv_elem = MPIDI_OFI_REQUEST(rreq, huge_info.recv_elem); /* If this is a "peek" element for an MPI_Probe, it shouldn't be matched. Grab the * important information and remove the element from the list. */ @@ -117,8 +114,6 @@ int MPIDI_OFI_recv_huge_control(MPIDI_OFI_huge_remote_info_t * info) MPIR_STATUS_SET_COUNT(recv_elem->localreq->status, info->msgsize); MPL_atomic_release_store_int(&(MPIDI_OFI_REQUEST(recv_elem->localreq, util_id)), MPIDI_OFI_PEEK_FOUND); - MPIDIU_map_erase(MPIDI_OFI_global.huge_recv_counters, - recv_elem->localreq->handle); MPL_free(recv_elem); recv_elem = NULL; } @@ -199,8 +194,7 @@ int MPIDI_OFI_peek_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reque recv_elem->peek = true; MPIR_Comm *comm_ptr = rreq->comm; recv_elem->comm_ptr = comm_ptr; - MPIDIU_map_set(MPIDI_OFI_global.huge_recv_counters, rreq->handle, recv_elem, - MPL_MEM_BUFFER); + MPIDI_OFI_REQUEST(rreq, huge_info.recv_elem) = recv_elem; huge_list_ptr = (MPIDI_OFI_huge_recv_list_t *) MPL_calloc(sizeof(*huge_list_ptr), 1, MPL_MEM_COMM); @@ -312,7 +306,6 @@ int MPIDI_OFI_get_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reques &ctrl, sizeof(ctrl), vni_local, vni_remote); MPIR_ERR_CHECK(mpi_errno); - MPIDIU_map_erase(MPIDI_OFI_global.huge_recv_counters, key_to_erase); MPL_free(recv_elem); goto fn_exit; diff --git a/src/mpid/ch4/netmod/ofi/ofi_init.c b/src/mpid/ch4/netmod/ofi/ofi_init.c index 6bb1bd91d9d..d9e20b0a288 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_init.c +++ b/src/mpid/ch4/netmod/ofi/ofi_init.c @@ -558,9 +558,6 @@ int MPIDI_OFI_init_local(int *tag_bits) MPIDIU_map_create(&MPIDI_OFI_global.win_map, MPL_MEM_RMA); MPIDIU_map_create(&MPIDI_OFI_global.req_map, MPL_MEM_OTHER); - /* Create huge protocol maps */ - MPIDIU_map_create(&MPIDI_OFI_global.huge_recv_counters, MPL_MEM_COMM); - /* Initialize RMA keys allocator */ MPIDI_OFI_mr_key_allocator_init(); @@ -903,8 +900,6 @@ int MPIDI_OFI_mpi_finalize_hook(void) MPIDIU_map_destroy(MPIDI_OFI_global.win_map); MPIDIU_map_destroy(MPIDI_OFI_global.req_map); - MPIDIU_map_destroy(MPIDI_OFI_global.huge_recv_counters); - if (MPIDI_OFI_ENABLE_AM) { for (int vni = 0; vni < MPIDI_OFI_global.num_vnis; vni++) { while (MPIDI_OFI_global.per_vni[vni].am_unordered_msgs) { diff --git a/src/mpid/ch4/netmod/ofi/ofi_pre.h b/src/mpid/ch4/netmod/ofi/ofi_pre.h index 9e0afe22a27..e1509e98131 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_pre.h +++ b/src/mpid/ch4/netmod/ofi/ofi_pre.h @@ -192,7 +192,7 @@ typedef struct { enum MPIDI_OFI_req_kind kind; union { struct fid_mr **huge_send_mrs; - MPIDI_OFI_huge_remote_info_t *info; + MPIDI_OFI_huge_recv_t *recv_elem; } huge_info; union { struct { diff --git a/src/mpid/ch4/netmod/ofi/ofi_types.h b/src/mpid/ch4/netmod/ofi/ofi_types.h index 9b14d791c62..c2cefae3baa 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_types.h +++ b/src/mpid/ch4/netmod/ofi/ofi_types.h @@ -361,9 +361,6 @@ typedef struct { * OFI provider at MPI initialization.*/ MPIDI_OFI_atomic_valid_t win_op_table[MPIR_DATATYPE_N_PREDEFINED][MPIDIG_ACCU_NUM_OP]; - /* huge protocol globals */ - void *huge_recv_counters; - /* Active Message Globals */ MPL_atomic_int_t am_inflight_inject_emus; MPL_atomic_int_t am_inflight_rma_send_mrs; From 1b4f50542f42f03f1f33eb128275b7ce5855e898 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Sun, 26 Sep 2021 10:01:58 -0500 Subject: [PATCH 15/21] ch4/ofi: revamp huge message handling Clean up the data structure to be more specific. Differentiate probe and mprobe. The former can be discarded when the control isn't ready. If we don't discard unsuccessful probe and put it in a queue, it can cause issues when another probe or recv come to interfere. Mprobe, on the other hand, is guaranteed to match once, so there is no issue. Persist the remote info with the original rreq. This avoids the use of separate hash maps to look up. It is also cleaner to track. --- src/mpid/ch4/netmod/ofi/globals.c | 8 +- src/mpid/ch4/netmod/ofi/ofi_events.c | 2 +- src/mpid/ch4/netmod/ofi/ofi_huge.c | 287 +++++++++++++-------------- src/mpid/ch4/netmod/ofi/ofi_impl.h | 3 +- src/mpid/ch4/netmod/ofi/ofi_pre.h | 6 +- src/mpid/ch4/netmod/ofi/ofi_probe.h | 18 +- src/mpid/ch4/netmod/ofi/ofi_recv.h | 4 + src/mpid/ch4/netmod/ofi/ofi_send.h | 20 +- src/mpid/ch4/netmod/ofi/ofi_types.h | 27 ++- src/mpid/ch4/netmod/ofi/util.c | 5 +- 10 files changed, 191 insertions(+), 189 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/globals.c b/src/mpid/ch4/netmod/ofi/globals.c index 67ea56bf50a..40f534b9697 100644 --- a/src/mpid/ch4/netmod/ofi/globals.c +++ b/src/mpid/ch4/netmod/ofi/globals.c @@ -7,10 +7,10 @@ #include "ofi_impl.h" MPIDI_OFI_global_t MPIDI_OFI_global; -MPIDI_OFI_huge_recv_t *MPIDI_unexp_huge_recv_head = NULL; -MPIDI_OFI_huge_recv_t *MPIDI_unexp_huge_recv_tail = NULL; -MPIDI_OFI_huge_recv_list_t *MPIDI_posted_huge_recv_head = NULL; -MPIDI_OFI_huge_recv_list_t *MPIDI_posted_huge_recv_tail = NULL; +MPIDI_OFI_huge_recv_list_t *MPIDI_huge_ctrl_head = NULL; +MPIDI_OFI_huge_recv_list_t *MPIDI_huge_ctrl_tail = NULL; +MPIDI_OFI_huge_recv_list_t *MPIDI_huge_recv_head = NULL; +MPIDI_OFI_huge_recv_list_t *MPIDI_huge_recv_tail = NULL; unsigned long long PVAR_COUNTER_nic_sent_bytes_count[MPIDI_OFI_MAX_NICS] ATTRIBUTE((unused)); unsigned long long PVAR_COUNTER_nic_recvd_bytes_count[MPIDI_OFI_MAX_NICS] ATTRIBUTE((unused)); diff --git a/src/mpid/ch4/netmod/ofi/ofi_events.c b/src/mpid/ch4/netmod/ofi/ofi_events.c index ec7f637dae2..f8c4c09e171 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_events.c +++ b/src/mpid/ch4/netmod/ofi/ofi_events.c @@ -94,7 +94,7 @@ static int send_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request comm = sreq->comm; num_nics = MPIDI_OFI_COMM(comm).enable_striping ? MPIDI_OFI_global.num_nics : 1; - huge_send_mrs = MPIDI_OFI_REQUEST(sreq, huge_info.huge_send_mrs); + huge_send_mrs = MPIDI_OFI_REQUEST(sreq, huge.send_mrs); /* Clean up the memory region */ if (!MPIDI_OFI_ENABLE_MR_PROV_KEY) { diff --git a/src/mpid/ch4/netmod/ofi/ofi_huge.c b/src/mpid/ch4/netmod/ofi/ofi_huge.c index 0e2d97e63b7..fb75af40417 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_huge.c +++ b/src/mpid/ch4/netmod/ofi/ofi_huge.c @@ -7,11 +7,33 @@ #include "ofi_impl.h" #include "ofi_events.h" +static int get_huge(MPIR_Request * rreq) +{ + int mpi_errno = MPI_SUCCESS; + MPIDI_OFI_huge_remote_info_t *info = MPIDI_OFI_REQUEST(rreq, huge.remote_info); + MPIDI_OFI_huge_recv_t *recv_elem = NULL; + + recv_elem = (MPIDI_OFI_huge_recv_t *) MPL_calloc(sizeof(*recv_elem), 1, MPL_MEM_BUFFER); + MPIR_ERR_CHKANDJUMP(recv_elem == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem"); + recv_elem->event_id = MPIDI_OFI_EVENT_GET_HUGE; + recv_elem->localreq = rreq; + if (MPIDI_OFI_COMM(rreq->comm).enable_striping) { + recv_elem->cur_offset = MPIDI_OFI_STRIPE_CHUNK_SIZE; + } else { + recv_elem->cur_offset = MPIDI_OFI_global.max_msg_size; + } + MPIDI_OFI_get_huge_event(info->vni_dst, NULL, (MPIR_Request *) recv_elem); + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} + /* this function called by recv event of a huge message */ int MPIDI_OFI_recv_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq) { int mpi_errno = MPI_SUCCESS; - MPIDI_OFI_huge_recv_t *recv_elem = NULL; MPIR_Comm *comm_ptr; MPIR_FUNC_ENTER; @@ -25,33 +47,30 @@ int MPIDI_OFI_recv_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reque comm_ptr = rreq->comm; MPIR_T_PVAR_COUNTER_INC(MULTINIC, nic_recvd_bytes_count[MPIDI_OFI_REQUEST(rreq, nic_num)], wc->len); - /* Check to see if the tracker is already in the unexpected list. - * Otherwise, allocate one. */ - { - MPIDI_OFI_huge_recv_t *list_ptr; - - LL_FOREACH(MPIDI_unexp_huge_recv_head, list_ptr) { - if (list_ptr->remote_info.comm_id == comm_ptr->context_id && - list_ptr->remote_info.origin_rank == MPIDI_OFI_cqe_get_source(wc, false) && - list_ptr->remote_info.tag == (MPIDI_OFI_TAG_MASK & wc->tag)) { - LL_DELETE(MPIDI_unexp_huge_recv_head, MPIDI_unexp_huge_recv_tail, list_ptr); - - recv_elem = list_ptr; - MPIDI_OFI_REQUEST(rreq, huge_info.recv_elem) = recv_elem; + if (MPIDI_OFI_REQUEST(rreq, huge.remote_info)) { + /* this is mrecv, we already got remote info */ + ready_to_get = true; + } else { + /* Check for remote control info */ + MPIDI_OFI_huge_recv_list_t *list_ptr; + int comm_id = comm_ptr->context_id; + int rank = MPIDI_OFI_cqe_get_source(wc, false); + int tag = (MPIDI_OFI_TAG_MASK & wc->tag); + + LL_FOREACH(MPIDI_huge_ctrl_head, list_ptr) { + if (list_ptr->comm_id == comm_id && list_ptr->rank == rank && list_ptr->tag == tag) { + MPIDI_OFI_REQUEST(rreq, huge.remote_info) = list_ptr->u.info; + LL_DELETE(MPIDI_huge_ctrl_head, MPIDI_huge_ctrl_tail, list_ptr); + MPL_free(list_ptr); + ready_to_get = true; break; } } } - if (recv_elem) { - ready_to_get = true; - } else { + if (!ready_to_get) { MPIDI_OFI_huge_recv_list_t *list_ptr; - recv_elem = (MPIDI_OFI_huge_recv_t *) MPL_calloc(sizeof(*recv_elem), 1, MPL_MEM_BUFFER); - MPIR_ERR_CHKANDJUMP(recv_elem == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem"); - MPIDI_OFI_REQUEST(rreq, huge_info.recv_elem) = recv_elem; - list_ptr = (MPIDI_OFI_huge_recv_list_t *) MPL_calloc(sizeof(*list_ptr), 1, MPL_MEM_BUFFER); if (!list_ptr) MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem"); @@ -59,25 +78,14 @@ int MPIDI_OFI_recv_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reque list_ptr->comm_id = comm_ptr->context_id; list_ptr->rank = MPIDI_OFI_cqe_get_source(wc, false); list_ptr->tag = (MPIDI_OFI_TAG_MASK & wc->tag); - list_ptr->rreq = rreq; + list_ptr->u.rreq = rreq; - LL_APPEND(MPIDI_posted_huge_recv_head, MPIDI_posted_huge_recv_tail, list_ptr); - } - - /* Plug the information for the huge event into the receive request and go - * to the MPIDI_OFI_get_huge_event function. */ - recv_elem->event_id = MPIDI_OFI_EVENT_GET_HUGE; - recv_elem->peek = false; - recv_elem->comm_ptr = comm_ptr; - recv_elem->localreq = rreq; - recv_elem->wc = *wc; - if (MPIDI_OFI_COMM(comm_ptr).enable_striping) { - recv_elem->cur_offset = MPIDI_OFI_STRIPE_CHUNK_SIZE; + LL_APPEND(MPIDI_huge_recv_head, MPIDI_huge_recv_tail, list_ptr); + /* control handler will finish the recv */ } else { - recv_elem->cur_offset = MPIDI_OFI_global.max_msg_size; - } - if (ready_to_get) { - MPIDI_OFI_get_huge_event(vni, NULL, (MPIR_Request *) recv_elem); + /* proceed to get the huge message */ + mpi_errno = get_huge(rreq); + MPIR_ERR_CHECK(mpi_errno); } fn_exit: @@ -88,58 +96,55 @@ int MPIDI_OFI_recv_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reque } /* This function is called when we receive a huge control message */ -int MPIDI_OFI_recv_huge_control(MPIDI_OFI_huge_remote_info_t * info) +int MPIDI_OFI_recv_huge_control(int comm_id, int rank, int tag, + MPIDI_OFI_huge_remote_info_t * info_ptr) { - MPIDI_OFI_huge_recv_t *recv_elem = NULL; int mpi_errno = MPI_SUCCESS; MPIR_FUNC_ENTER; - bool ready_to_get = false; + MPIDI_OFI_huge_recv_list_t *list_ptr; + MPIR_Request *rreq = NULL; + MPIDI_OFI_huge_remote_info_t *info; + + /* need persist the info. It will eventually get freed at recv completion */ + info = MPL_malloc(sizeof(MPIDI_OFI_huge_remote_info_t), MPL_MEM_OTHER); + MPIR_Assert(info); + memcpy(info, info_ptr, sizeof(*info)); /* If there has been a posted receive, search through the list of unmatched * receives to find the one that goes with the incoming message. */ - { - MPIDI_OFI_huge_recv_list_t *list_ptr; - - LL_FOREACH(MPIDI_posted_huge_recv_head, list_ptr) { - if (list_ptr->comm_id == info->comm_id && - list_ptr->rank == info->origin_rank && list_ptr->tag == info->tag) { - LL_DELETE(MPIDI_posted_huge_recv_head, MPIDI_posted_huge_recv_tail, list_ptr); - - recv_elem = MPIDI_OFI_REQUEST(rreq, huge_info.recv_elem); - - /* If this is a "peek" element for an MPI_Probe, it shouldn't be matched. Grab the - * important information and remove the element from the list. */ - if (recv_elem->peek) { - MPIR_STATUS_SET_COUNT(recv_elem->localreq->status, info->msgsize); - MPL_atomic_release_store_int(&(MPIDI_OFI_REQUEST(recv_elem->localreq, util_id)), - MPIDI_OFI_PEEK_FOUND); - MPL_free(recv_elem); - recv_elem = NULL; - } - - MPL_free(list_ptr); - break; - } + LL_FOREACH(MPIDI_huge_recv_head, list_ptr) { + if (list_ptr->comm_id == comm_id && list_ptr->rank == rank && list_ptr->tag == tag) { + rreq = list_ptr->u.rreq; + LL_DELETE(MPIDI_huge_recv_head, MPIDI_huge_recv_tail, list_ptr); + MPL_free(list_ptr); + break; } } - if (recv_elem) { - ready_to_get = true; - } else { - /* Put the struct describing the transfer on an unexpected list to be retrieved later */ - recv_elem = (MPIDI_OFI_huge_recv_t *) MPL_calloc(sizeof(*recv_elem), 1, MPL_MEM_COMM); - if (!recv_elem) + if (!rreq) { + list_ptr = (MPIDI_OFI_huge_recv_list_t *) MPL_calloc(sizeof(MPIDI_OFI_huge_recv_list_t), + 1, MPL_MEM_OTHER); + if (!list_ptr) { MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem"); + } + list_ptr->comm_id = comm_id; + list_ptr->rank = rank; + list_ptr->tag = tag; + list_ptr->u.info = info; - LL_APPEND(MPIDI_unexp_huge_recv_head, MPIDI_unexp_huge_recv_tail, recv_elem); - } - - recv_elem->event_id = MPIDI_OFI_EVENT_GET_HUGE; - recv_elem->remote_info = *info; - recv_elem->next = NULL; - if (ready_to_get) { - MPIDI_OFI_get_huge_event(info->vni_dst, NULL, (MPIR_Request *) recv_elem); + LL_APPEND(MPIDI_huge_ctrl_head, MPIDI_huge_ctrl_tail, list_ptr); + /* let MPIDI_OFI_recv_huge_event finish the recv */ + } else if (MPIDI_OFI_REQUEST(rreq, kind) == MPIDI_OFI_req_kind__mprobe) { + /* attach info and finish the mprobe */ + MPIDI_OFI_REQUEST(rreq, huge.remote_info) = info; + MPIR_STATUS_SET_COUNT(rreq->status, info->msgsize); + MPL_atomic_release_store_int(&(MPIDI_OFI_REQUEST(rreq, util_id)), MPIDI_OFI_PEEK_FOUND); + } else { + /* attach info and finish recv */ + MPIDI_OFI_REQUEST(rreq, huge.remote_info) = info; + mpi_errno = get_huge(rreq); + MPIR_ERR_CHECK(mpi_errno); } fn_exit: @@ -155,22 +160,28 @@ int MPIDI_OFI_peek_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reque MPIR_FUNC_ENTER; MPI_Aint count = 0; - MPIDI_OFI_huge_recv_t *list_ptr; + MPIDI_OFI_huge_recv_list_t *list_ptr; bool found_msg = false; /* If this is a huge message, find the control message on the unexpected list that matches * with this and return the size in that. */ - LL_FOREACH(MPIDI_unexp_huge_recv_head, list_ptr) { + LL_FOREACH(MPIDI_huge_ctrl_head, list_ptr) { + /* FIXME: fix the type of comm_id */ uint64_t context_id = MPIDI_OFI_CONTEXT_MASK & wc->tag; - uint64_t tag = MPIDI_OFI_TAG_MASK & wc->tag; - if (list_ptr->remote_info.comm_id == context_id && - list_ptr->remote_info.origin_rank == MPIDI_OFI_cqe_get_source(wc, false) && - list_ptr->remote_info.tag == tag) { - count = list_ptr->remote_info.msgsize; + int rank = MPIDI_OFI_cqe_get_source(wc, false); + int tag = (int) (MPIDI_OFI_TAG_MASK & wc->tag); + if (list_ptr->comm_id == context_id && list_ptr->rank == rank && list_ptr->tag == tag) { + count = list_ptr->u.info->msgsize; found_msg = true; + break; } } if (found_msg) { + if (MPIDI_OFI_REQUEST(rreq, kind) == MPIDI_OFI_req_kind__mprobe) { + MPIDI_OFI_REQUEST(rreq, huge.remote_info) = list_ptr->u.info; + LL_DELETE(MPIDI_huge_ctrl_head, MPIDI_huge_ctrl_tail, list_ptr); + MPL_free(list_ptr); + } rreq->status.MPI_SOURCE = MPIDI_OFI_cqe_get_source(wc, false); rreq->status.MPI_TAG = MPIDI_OFI_init_get_tag(wc->tag); rreq->status.MPI_ERROR = MPI_SUCCESS; @@ -183,46 +194,24 @@ int MPIDI_OFI_peek_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reque /* return not found for this probe. User can probe again. */ MPL_atomic_release_store_int(&(MPIDI_OFI_REQUEST(rreq, util_id)), MPIDI_OFI_PEEK_NOT_FOUND); } else if (MPIDI_OFI_REQUEST(rreq, kind) == MPIDI_OFI_req_kind__mprobe) { + /* fill the status with wc info. Count is still missing */ + rreq->status.MPI_SOURCE = MPIDI_OFI_cqe_get_source(wc, false); + rreq->status.MPI_TAG = MPIDI_OFI_init_get_tag(wc->tag); + rreq->status.MPI_ERROR = MPI_SUCCESS; + /* post the rreq to list and let control handler handle it */ - MPIDI_OFI_huge_recv_t *recv_elem; MPIDI_OFI_huge_recv_list_t *huge_list_ptr; - /* Create an element in the posted list that only indicates a peek and will be - * deleted as soon as it's fulfilled without being matched. */ - recv_elem = (MPIDI_OFI_huge_recv_t *) MPL_calloc(sizeof(*recv_elem), 1, MPL_MEM_COMM); - MPIR_ERR_CHKANDJUMP(recv_elem == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem"); - recv_elem->peek = true; - MPIR_Comm *comm_ptr = rreq->comm; - recv_elem->comm_ptr = comm_ptr; - MPIDI_OFI_REQUEST(rreq, huge_info.recv_elem) = recv_elem; - huge_list_ptr = (MPIDI_OFI_huge_recv_list_t *) MPL_calloc(sizeof(*huge_list_ptr), 1, MPL_MEM_COMM); MPIR_ERR_CHKANDJUMP(huge_list_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem"); - recv_elem->remote_info.comm_id = huge_list_ptr->comm_id = MPIDI_OFI_CONTEXT_MASK & wc->tag; - recv_elem->remote_info.origin_rank = huge_list_ptr->rank = - MPIDI_OFI_cqe_get_source(wc, false); - recv_elem->remote_info.tag = huge_list_ptr->tag = MPIDI_OFI_TAG_MASK & wc->tag; - recv_elem->localreq = huge_list_ptr->rreq = rreq; - recv_elem->event_id = MPIDI_OFI_EVENT_GET_HUGE; - recv_elem->wc = *wc; - if (MPIDI_OFI_COMM(comm_ptr).enable_striping) { - recv_elem->cur_offset = MPIDI_OFI_STRIPE_CHUNK_SIZE; - } else { - recv_elem->cur_offset = MPIDI_OFI_global.max_msg_size; - } - LL_APPEND(MPIDI_posted_huge_recv_head, MPIDI_posted_huge_recv_tail, huge_list_ptr); + huge_list_ptr->comm_id = MPIDI_OFI_CONTEXT_MASK & wc->tag; + huge_list_ptr->rank = MPIDI_OFI_cqe_get_source(wc, false); + huge_list_ptr->tag = MPIDI_OFI_TAG_MASK & wc->tag; + huge_list_ptr->u.rreq = rreq; - /* FIXME: we don't have the correct count so it wrong to return FOUND here */ - rreq->status.MPI_SOURCE = MPIDI_OFI_cqe_get_source(wc, false); - rreq->status.MPI_TAG = MPIDI_OFI_init_get_tag(wc->tag); - rreq->status.MPI_ERROR = MPI_SUCCESS; - MPIR_STATUS_SET_COUNT(rreq->status, count); - /* util_id should be the last thing to change in rreq. Reason is - * we use util_id to indicate peek_event has completed and all the - * relevant values have been copied to rreq. */ - MPL_atomic_release_store_int(&(MPIDI_OFI_REQUEST(rreq, util_id)), MPIDI_OFI_PEEK_FOUND); + LL_APPEND(MPIDI_huge_recv_head, MPIDI_huge_recv_tail, huge_list_ptr); } @@ -233,12 +222,12 @@ int MPIDI_OFI_peek_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reque goto fn_exit; } -static uintptr_t recv_rbase(MPIDI_OFI_huge_recv_t * recv_elem) +static uintptr_t recv_rbase(MPIDI_OFI_huge_remote_info_t * remote_info) { if (!MPIDI_OFI_ENABLE_MR_VIRT_ADDRESS) { return 0; } else { - return (uintptr_t) recv_elem->remote_info.send_buf; + return (uintptr_t) remote_info->send_buf; } } @@ -257,21 +246,23 @@ int MPIDI_OFI_get_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reques { int mpi_errno = MPI_SUCCESS; MPIDI_OFI_huge_recv_t *recv_elem = (MPIDI_OFI_huge_recv_t *) req; + MPIDI_OFI_huge_remote_info_t *info = MPIDI_OFI_REQUEST(recv_elem->localreq, huge.remote_info); + MPIR_Comm *comm = recv_elem->localreq->comm; uint64_t remote_key; size_t bytesLeft, bytesToGet; MPIR_FUNC_ENTER; void *recv_buf = MPIDI_OFI_REQUEST(recv_elem->localreq, util.iov.iov_base); MPI_Aint data_sz = MPIDI_OFI_REQUEST(recv_elem->localreq, util.iov.iov_len); - if (recv_elem->remote_info.msgsize > data_sz) { + if (info->msgsize > data_sz) { recv_elem->localreq->status.MPI_ERROR = MPI_ERR_TRUNCATE; - recv_elem->remote_info.msgsize = data_sz; + info->msgsize = data_sz; } - if (MPIDI_OFI_COMM(recv_elem->comm_ptr).enable_striping) { + if (MPIDI_OFI_COMM(comm).enable_striping) { /* Subtract one stripe_chunk_size because we send the first chunk via a regular message * instead of the memory region */ - recv_elem->stripe_size = (recv_elem->remote_info.msgsize - MPIDI_OFI_STRIPE_CHUNK_SIZE) + recv_elem->stripe_size = (info->msgsize - MPIDI_OFI_STRIPE_CHUNK_SIZE) / MPIDI_OFI_global.num_nics; /* striping */ if (recv_elem->stripe_size > MPIDI_OFI_global.max_msg_size) { @@ -279,47 +270,49 @@ int MPIDI_OFI_get_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reques } if (recv_elem->chunks_outstanding) recv_elem->chunks_outstanding--; - bytesLeft = recv_elem->remote_info.msgsize - recv_elem->cur_offset; + bytesLeft = info->msgsize - recv_elem->cur_offset; bytesToGet = (bytesLeft <= recv_elem->stripe_size) ? bytesLeft : recv_elem->stripe_size; } else { /* Subtract one max_msg_size because we send the first chunk via a regular message * instead of the memory region */ - bytesLeft = recv_elem->remote_info.msgsize - recv_elem->cur_offset; + bytesLeft = info->msgsize - recv_elem->cur_offset; bytesToGet = (bytesLeft <= MPIDI_OFI_global.max_msg_size) ? bytesLeft : MPIDI_OFI_global.max_msg_size; } if (bytesToGet == 0ULL && recv_elem->chunks_outstanding == 0) { + int vni = info->vni_dst; + struct fi_cq_tagged_entry wc; + wc.len = recv_elem->cur_offset; + wc.data = info->origin_rank; + wc.tag = info->tag; + MPIDI_OFI_recv_event(vni, &wc, recv_elem->localreq, recv_elem->event_id); + MPIDI_OFI_send_control_t ctrl; - /* recv_elem->localreq may be freed during MPIDI_OFI_recv_event. - * Need to backup the handle here for later use with MPIDIU_map_erase. */ - uint64_t key_to_erase = recv_elem->localreq->handle; - recv_elem->wc.len = recv_elem->cur_offset; - MPIDI_OFI_recv_event(recv_elem->remote_info.vni_dst, &recv_elem->wc, recv_elem->localreq, - recv_elem->event_id); ctrl.type = MPIDI_OFI_CTRL_HUGEACK; - ctrl.u.huge_ack.ackreq = recv_elem->remote_info.ackreq; + ctrl.u.huge_ack.ackreq = info->ackreq; /* note: it's receiver ack sender */ - int vni_remote = recv_elem->remote_info.vni_src; - int vni_local = recv_elem->remote_info.vni_dst; - mpi_errno = MPIDI_NM_am_send_hdr(recv_elem->remote_info.origin_rank, recv_elem->comm_ptr, + int vni_remote = info->vni_src; + int vni_local = info->vni_dst; + mpi_errno = MPIDI_NM_am_send_hdr(info->origin_rank, comm, MPIDI_OFI_INTERNAL_HANDLER_CONTROL, &ctrl, sizeof(ctrl), vni_local, vni_remote); MPIR_ERR_CHECK(mpi_errno); + MPL_free(info); MPL_free(recv_elem); goto fn_exit; } - int vni_src = recv_elem->remote_info.vni_src; - int vni_dst = recv_elem->remote_info.vni_dst; - if (MPIDI_OFI_COMM(recv_elem->comm_ptr).enable_striping) { /* if striping enabled */ + int vni_src = info->vni_src; + int vni_dst = info->vni_dst; + if (MPIDI_OFI_COMM(comm).enable_striping) { /* if striping enabled */ if (recv_elem->cur_offset >= MPIDI_OFI_STRIPE_CHUNK_SIZE && bytesLeft > 0) { for (int nic = 0; nic < MPIDI_OFI_global.num_nics; nic++) { - int ctx_idx = MPIDI_OFI_get_ctx_index(recv_elem->comm_ptr, vni_dst, nic); - remote_key = recv_elem->remote_info.rma_keys[nic]; + int ctx_idx = MPIDI_OFI_get_ctx_index(comm, vni_dst, nic); + remote_key = info->rma_keys[nic]; - bytesLeft = recv_elem->remote_info.msgsize - recv_elem->cur_offset; + bytesLeft = info->msgsize - recv_elem->cur_offset; if (bytesLeft <= 0) { break; } @@ -331,7 +324,7 @@ int MPIDI_OFI_get_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reques MPIDI_OFI_CALL_RETRY(fi_read(MPIDI_OFI_global.ctx[ctx_idx].tx, (void *) ((char *) recv_buf + recv_elem->cur_offset), /* local buffer */ bytesToGet, /* bytes */ NULL, /* descriptor */ - MPIDI_OFI_comm_to_phys(recv_elem->comm_ptr, recv_elem->remote_info.origin_rank, nic, vni_dst, vni_src), recv_rbase(recv_elem) + recv_elem->cur_offset, /* remote maddr */ + MPIDI_OFI_comm_to_phys(comm, info->origin_rank, nic, vni_dst, vni_src), recv_rbase(info) + recv_elem->cur_offset, /* remote maddr */ remote_key, /* Key */ (void *) &recv_elem->context), nic, /* Context */ rdma_readfrom, FALSE); @@ -343,15 +336,15 @@ int MPIDI_OFI_get_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reques } } else { int nic = 0; - int ctx_idx = MPIDI_OFI_get_ctx_index(recv_elem->comm_ptr, vni_src, nic); - remote_key = recv_elem->remote_info.rma_keys[nic]; - MPIDI_OFI_cntr_incr(recv_elem->comm_ptr, vni_src, nic); + int ctx_idx = MPIDI_OFI_get_ctx_index(comm, vni_src, nic); + remote_key = info->rma_keys[nic]; + MPIDI_OFI_cntr_incr(comm, vni_src, nic); MPIDI_OFI_CALL_RETRY(fi_read(MPIDI_OFI_global.ctx[ctx_idx].tx, /* endpoint */ (void *) ((char *) recv_buf + recv_elem->cur_offset), /* local buffer */ bytesToGet, /* bytes */ NULL, /* descriptor */ - MPIDI_OFI_comm_to_phys(recv_elem->comm_ptr, recv_elem->remote_info.origin_rank, nic, vni_src, vni_dst), /* Destination */ - recv_rbase(recv_elem) + recv_elem->cur_offset, /* remote maddr */ + MPIDI_OFI_comm_to_phys(comm, info->origin_rank, nic, vni_src, vni_dst), /* Destination */ + recv_rbase(info) + recv_elem->cur_offset, /* remote maddr */ remote_key, /* Key */ (void *) &recv_elem->context), vni_src, rdma_readfrom, /* Context */ FALSE); diff --git a/src/mpid/ch4/netmod/ofi/ofi_impl.h b/src/mpid/ch4/netmod/ofi/ofi_impl.h index 5d2dfa45929..1014ad0e3d5 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_impl.h +++ b/src/mpid/ch4/netmod/ofi/ofi_impl.h @@ -309,7 +309,8 @@ MPL_STATIC_INLINE_PREFIX void MPIDI_OFI_cntr_set(int ctx_idx, int val) #define MPIDI_OFI_INVALID_MR_KEY 0xFFFFFFFFFFFFFFFFULL int MPIDI_OFI_retry_progress(void); int MPIDI_OFI_recv_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq); -int MPIDI_OFI_recv_huge_control(MPIDI_OFI_huge_remote_info_t * info); +int MPIDI_OFI_recv_huge_control(int comm_id, int rank, int tag, + MPIDI_OFI_huge_remote_info_t * info); int MPIDI_OFI_peek_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq); int MPIDI_OFI_get_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * req); int MPIDI_OFI_control_handler(void *am_hdr, void *data, MPI_Aint data_sz, diff --git a/src/mpid/ch4/netmod/ofi/ofi_pre.h b/src/mpid/ch4/netmod/ofi/ofi_pre.h index e1509e98131..e56f6c65e15 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_pre.h +++ b/src/mpid/ch4/netmod/ofi/ofi_pre.h @@ -191,9 +191,9 @@ typedef struct { * if needed. */ enum MPIDI_OFI_req_kind kind; union { - struct fid_mr **huge_send_mrs; - MPIDI_OFI_huge_recv_t *recv_elem; - } huge_info; + struct fid_mr **send_mrs; + void *remote_info; + } huge; union { struct { void *buf; diff --git a/src/mpid/ch4/netmod/ofi/ofi_probe.h b/src/mpid/ch4/netmod/ofi/ofi_probe.h index 60b1ed16ac9..cae8b5fc38f 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_probe.h +++ b/src/mpid/ch4/netmod/ofi/ofi_probe.h @@ -14,9 +14,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_iprobe(int source, int context_offset, MPIDI_av_entry_t * addr, int vni_src, int vni_dst, int *flag, - MPI_Status * status, - MPIR_Request ** message, - enum MPIDI_OFI_req_kind probe_kind) + MPI_Status * status, MPIR_Request ** message) { int mpi_errno = MPI_SUCCESS; fi_addr_t remote_proc; @@ -42,7 +40,12 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_iprobe(int source, } else { rreq = &r; } - MPIDI_OFI_REQUEST(rreq, kind) = probe_kind; + if (message) { + MPIDI_OFI_REQUEST(rreq, kind) = MPIDI_OFI_req_kind__mprobe; + } else { + MPIDI_OFI_REQUEST(rreq, kind) = MPIDI_OFI_req_kind__probe; + } + MPIDI_OFI_REQUEST(rreq, huge.remote_info) = NULL; rreq->comm = comm; MPIR_Comm_add_ref(comm); @@ -61,7 +64,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_iprobe(int source, msg.data = 0; uint64_t recv_flags = FI_PEEK | FI_COMPLETION; - if (probe_kind == MPIDI_OFI_req_kind__mprobe) { + if (message) { recv_flags |= FI_CLAIM; } MPIDI_OFI_CALL_RETURN(fi_trecvmsg(MPIDI_OFI_global.ctx[ctx_idx].rx, &msg, recv_flags), ofi_err); @@ -143,7 +146,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_mpi_improbe(int source, MPIDI_OFI_THREAD_CS_ENTER_VCI_OPTIONAL(vni_dst); /* Set flags for mprobe peek, when ready */ mpi_errno = MPIDI_OFI_do_iprobe(source, tag, comm, context_offset, addr, vni_src, vni_dst, - flag, status, message, MPIDI_OFI_req_kind__mprobe); + flag, status, message); MPIDI_OFI_THREAD_CS_EXIT_VCI_OPTIONAL(vni_dst); if (mpi_errno != MPI_SUCCESS) @@ -171,8 +174,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_mpi_iprobe(int source, } else { MPIDI_OFI_THREAD_CS_ENTER_VCI_OPTIONAL(vni_dst); mpi_errno = MPIDI_OFI_do_iprobe(source, tag, comm, context_offset, addr, - vni_src, vni_dst, flag, status, NULL, - MPIDI_OFI_req_kind__probe); + vni_src, vni_dst, flag, status, NULL); MPIDI_OFI_THREAD_CS_EXIT_VCI_OPTIONAL(vni_dst); } diff --git a/src/mpid/ch4/netmod/ofi/ofi_recv.h b/src/mpid/ch4/netmod/ofi/ofi_recv.h index 40049aa2dc5..ea58cf6624b 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_recv.h +++ b/src/mpid/ch4/netmod/ofi/ofi_recv.h @@ -156,6 +156,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_irecv(void *buf, } *request = rreq; + MPIDI_OFI_REQUEST(rreq, kind) = MPIDI_OFI_req_kind__any; + if (!flags) { + MPIDI_OFI_REQUEST(rreq, huge.remote_info) = NULL; /* for huge recv remote info */ + } /* Calculate the correct NICs. */ sender_nic = MPIDI_OFI_multx_sender_nic_index(comm, comm->recvcontext_id, MPIR_Process.rank, diff --git a/src/mpid/ch4/netmod/ofi/ofi_send.h b/src/mpid/ch4/netmod/ofi/ofi_send.h index 2f6f189d9d0..406c11502a3 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_send.h +++ b/src/mpid/ch4/netmod/ofi/ofi_send.h @@ -313,7 +313,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_normal(const void *buf, MPI_Aint cou &huge_send_mrs[i], /* Out: memregion object */ NULL), mr_reg); /* In: context */ } - MPIDI_OFI_REQUEST(sreq, huge_info.huge_send_mrs) = huge_send_mrs; + MPIDI_OFI_REQUEST(sreq, huge.send_mrs) = huge_send_mrs; if (MPIDI_OFI_ENABLE_MR_PROV_KEY) { /* MR_BASIC */ for (int i = 0; i < num_nics; i++) { @@ -334,16 +334,16 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_normal(const void *buf, MPI_Aint cou MPIDI_OFI_send_control_t ctrl; ctrl.type = MPIDI_OFI_CTRL_HUGE; for (int i = 0; i < num_nics; i++) { - ctrl.u.huge.rma_keys[i] = rma_keys[i]; + ctrl.u.huge.info.rma_keys[i] = rma_keys[i]; } - ctrl.u.huge.tag = tag; - ctrl.u.huge.vni_src = vni_src; - ctrl.u.huge.vni_dst = vni_dst; - ctrl.u.huge.origin_rank = comm->rank; - ctrl.u.huge.send_buf = send_buf; - ctrl.u.huge.msgsize = data_sz; - ctrl.u.huge.comm_id = comm->context_id; - ctrl.u.huge.ackreq = sreq; + ctrl.u.huge.info.comm_id = comm->context_id; + ctrl.u.huge.info.tag = tag; + ctrl.u.huge.info.origin_rank = comm->rank; + ctrl.u.huge.info.vni_src = vni_src; + ctrl.u.huge.info.vni_dst = vni_dst; + ctrl.u.huge.info.send_buf = send_buf; + ctrl.u.huge.info.msgsize = data_sz; + ctrl.u.huge.info.ackreq = sreq; mpi_errno = MPIDI_NM_am_send_hdr(dst_rank, comm, MPIDI_OFI_INTERNAL_HANDLER_CONTROL, &ctrl, sizeof(ctrl), vni_src, vni_dst); diff --git a/src/mpid/ch4/netmod/ofi/ofi_types.h b/src/mpid/ch4/netmod/ofi/ofi_types.h index c2cefae3baa..b110b8a8139 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_types.h +++ b/src/mpid/ch4/netmod/ofi/ofi_types.h @@ -382,13 +382,13 @@ typedef struct { } MPIDI_OFI_global_t; typedef struct { + int comm_id; int origin_rank; + int tag; MPIR_Request *ackreq; void *send_buf; size_t msgsize; - int comm_id; uint64_t rma_keys[MPIDI_OFI_MAX_NICS]; - int tag; int vni_src; int vni_dst; } MPIDI_OFI_huge_remote_info_t; @@ -396,7 +396,9 @@ typedef struct { typedef struct { int16_t type; union { - MPIDI_OFI_huge_remote_info_t huge; + struct { + MPIDI_OFI_huge_remote_info_t info; + } huge; struct { MPIR_Request *ackreq; } huge_ack; @@ -497,17 +499,11 @@ typedef struct MPIDI_OFI_huge_recv { char pad[MPIDI_REQUEST_HDR_SIZE]; struct fi_context context[MPIDI_OFI_CONTEXT_STRUCTS]; /* fixed field, do not move */ int event_id; /* fixed field, do not move */ - MPIDI_OFI_huge_remote_info_t remote_info; - bool peek; /* Flag to indicate whether this struct has been created to track an uncompleted peek - * operation. */ size_t cur_offset; size_t stripe_size; int chunks_outstanding; MPIR_Comm *comm_ptr; MPIR_Request *localreq; - struct fi_cq_tagged_entry wc; - struct MPIDI_OFI_huge_recv *next; /* Points to the next entry in the unexpected list - * (when in the unexpected list) */ } MPIDI_OFI_huge_recv_t; /* The list of posted huge receives that haven't been matched yet. These need @@ -518,16 +514,19 @@ typedef struct MPIDI_OFI_huge_recv_list { int comm_id; int rank; int tag; - MPIR_Request *rreq; + union { + MPIDI_OFI_huge_remote_info_t *info; /* ctrl list */ + MPIR_Request *rreq; /* recv list */ + } u; struct MPIDI_OFI_huge_recv_list *next; } MPIDI_OFI_huge_recv_list_t; /* Externs */ extern MPIDI_OFI_global_t MPIDI_OFI_global; -extern MPIDI_OFI_huge_recv_t *MPIDI_unexp_huge_recv_head; -extern MPIDI_OFI_huge_recv_t *MPIDI_unexp_huge_recv_tail; -extern MPIDI_OFI_huge_recv_list_t *MPIDI_posted_huge_recv_head; -extern MPIDI_OFI_huge_recv_list_t *MPIDI_posted_huge_recv_tail; +extern MPIDI_OFI_huge_recv_list_t *MPIDI_huge_ctrl_head; +extern MPIDI_OFI_huge_recv_list_t *MPIDI_huge_ctrl_tail; +extern MPIDI_OFI_huge_recv_list_t *MPIDI_huge_recv_head; +extern MPIDI_OFI_huge_recv_list_t *MPIDI_huge_recv_tail; extern MPIDI_OFI_capabilities_t MPIDI_OFI_caps_list[MPIDI_OFI_NUM_SETS]; diff --git a/src/mpid/ch4/netmod/ofi/util.c b/src/mpid/ch4/netmod/ofi/util.c index ed2f77dd2ef..a574ae3d328 100644 --- a/src/mpid/ch4/netmod/ofi/util.c +++ b/src/mpid/ch4/netmod/ofi/util.c @@ -167,7 +167,10 @@ int MPIDI_OFI_control_handler(void *am_hdr, void *data, MPI_Aint data_sz, break; case MPIDI_OFI_CTRL_HUGE: - mpi_errno = MPIDI_OFI_recv_huge_control(&(ctrlsend->u.huge)); + mpi_errno = MPIDI_OFI_recv_huge_control(ctrlsend->u.huge.info.comm_id, + ctrlsend->u.huge.info.origin_rank, + ctrlsend->u.huge.info.tag, + &(ctrlsend->u.huge.info)); break; default: From 81fa0adac0b81c5d377f6cbeca5a28666b1877e9 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Sun, 26 Sep 2021 20:42:49 -0500 Subject: [PATCH 16/21] ch4/ofi: split get_huge_complete Split the huge recv completion into static function. --- src/mpid/ch4/netmod/ofi/ofi_huge.c | 54 +++++++++++++++++++----------- 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_huge.c b/src/mpid/ch4/netmod/ofi/ofi_huge.c index fb75af40417..23ca888be07 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_huge.c +++ b/src/mpid/ch4/netmod/ofi/ofi_huge.c @@ -30,6 +30,40 @@ static int get_huge(MPIR_Request * rreq) goto fn_exit; } +static int get_huge_complete(MPIDI_OFI_huge_recv_t * recv_elem) +{ + int mpi_errno = MPI_SUCCESS; + MPIR_FUNC_ENTER; + + MPIDI_OFI_huge_remote_info_t *info = MPIDI_OFI_REQUEST(recv_elem->localreq, huge.remote_info); + + /* note: it's receiver ack sender */ + int vni_remote = info->vni_src; + int vni_local = info->vni_dst; + + struct fi_cq_tagged_entry wc; + wc.len = recv_elem->cur_offset; + wc.data = info->origin_rank; + wc.tag = info->tag; + MPIDI_OFI_recv_event(vni_local, &wc, recv_elem->localreq, MPIDI_OFI_EVENT_GET_HUGE); + + MPIDI_OFI_send_control_t ctrl; + ctrl.type = MPIDI_OFI_CTRL_HUGEACK; + ctrl.u.huge_ack.ackreq = info->ackreq; + mpi_errno = MPIDI_NM_am_send_hdr(info->origin_rank, comm, + MPIDI_OFI_INTERNAL_HANDLER_CONTROL, + &ctrl, sizeof(ctrl), vni_local, vni_remote); + MPIR_ERR_CHECK(mpi_errno); + + MPL_free(info); + + fn_exit: + MPIR_FUNC_EXIT; + return mpi_errno; + fn_fail: + goto fn_exit; +} + /* this function called by recv event of a huge message */ int MPIDI_OFI_recv_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq) { @@ -280,27 +314,9 @@ int MPIDI_OFI_get_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reques bytesLeft : MPIDI_OFI_global.max_msg_size; } if (bytesToGet == 0ULL && recv_elem->chunks_outstanding == 0) { - int vni = info->vni_dst; - struct fi_cq_tagged_entry wc; - wc.len = recv_elem->cur_offset; - wc.data = info->origin_rank; - wc.tag = info->tag; - MPIDI_OFI_recv_event(vni, &wc, recv_elem->localreq, recv_elem->event_id); - - MPIDI_OFI_send_control_t ctrl; - ctrl.type = MPIDI_OFI_CTRL_HUGEACK; - ctrl.u.huge_ack.ackreq = info->ackreq; - /* note: it's receiver ack sender */ - int vni_remote = info->vni_src; - int vni_local = info->vni_dst; - mpi_errno = MPIDI_NM_am_send_hdr(info->origin_rank, comm, - MPIDI_OFI_INTERNAL_HANDLER_CONTROL, - &ctrl, sizeof(ctrl), vni_local, vni_remote); + mpi_errno = get_huge_complete(recv_elem); MPIR_ERR_CHECK(mpi_errno); - - MPL_free(info); MPL_free(recv_elem); - goto fn_exit; } From 88864733d6c2369bae6220a53d1968d100ebc631 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Sun, 26 Sep 2021 21:06:11 -0500 Subject: [PATCH 17/21] ch4/ofi: handle when huge message sent to small buffer We need handle this case the the sender still receives the ack message. --- src/mpid/ch4/netmod/ofi/ofi_events.h | 4 +++ src/mpid/ch4/netmod/ofi/ofi_huge.c | 54 ++++++++++++++++++---------- 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_events.h b/src/mpid/ch4/netmod/ofi/ofi_events.h index faacbeb9b9f..f88a3060cdb 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_events.h +++ b/src/mpid/ch4/netmod/ofi/ofi_events.h @@ -53,6 +53,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_recv_event(int vni, struct fi_cq_tagged_e size_t count; MPIR_FUNC_ENTER; + if (wc->tag & MPIDI_OFI_HUGE_SEND) { + mpi_errno = MPIDI_OFI_recv_huge_event(vni, wc, rreq); + goto fn_exit; + } rreq->status.MPI_SOURCE = MPIDI_OFI_cqe_get_source(wc, true); if (!rreq->status.MPI_ERROR) { rreq->status.MPI_ERROR = MPIDI_OFI_idata_get_error_bits(wc->data); diff --git a/src/mpid/ch4/netmod/ofi/ofi_huge.c b/src/mpid/ch4/netmod/ofi/ofi_huge.c index 23ca888be07..04ca13a2f1e 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_huge.c +++ b/src/mpid/ch4/netmod/ofi/ofi_huge.c @@ -7,21 +7,42 @@ #include "ofi_impl.h" #include "ofi_events.h" +static int get_huge(MPIR_Request * rreq); +static int get_huge_complete(MPIR_Request * rreq); + static int get_huge(MPIR_Request * rreq) { int mpi_errno = MPI_SUCCESS; MPIDI_OFI_huge_remote_info_t *info = MPIDI_OFI_REQUEST(rreq, huge.remote_info); - MPIDI_OFI_huge_recv_t *recv_elem = NULL; + MPI_Aint cur_offset; + if (MPIDI_OFI_COMM(rreq->comm).enable_striping) { + cur_offset = MPIDI_OFI_STRIPE_CHUNK_SIZE; + } else { + cur_offset = MPIDI_OFI_global.max_msg_size; + } + + MPI_Aint data_sz = MPIDI_OFI_REQUEST(rreq, util.iov.iov_len); + + if (data_sz < info->msgsize) { + rreq->status.MPI_ERROR = MPI_ERR_TRUNCATE; + info->msgsize = data_sz; + } + + if (data_sz < cur_offset) { + /* huge message sent to small recv buffer */ + mpi_errno = get_huge_complete(rreq); + MPIR_ERR_CHECK(mpi_errno); + goto fn_exit; + } + + MPIDI_OFI_huge_recv_t *recv_elem = NULL; recv_elem = (MPIDI_OFI_huge_recv_t *) MPL_calloc(sizeof(*recv_elem), 1, MPL_MEM_BUFFER); MPIR_ERR_CHKANDJUMP(recv_elem == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem"); recv_elem->event_id = MPIDI_OFI_EVENT_GET_HUGE; recv_elem->localreq = rreq; - if (MPIDI_OFI_COMM(rreq->comm).enable_striping) { - recv_elem->cur_offset = MPIDI_OFI_STRIPE_CHUNK_SIZE; - } else { - recv_elem->cur_offset = MPIDI_OFI_global.max_msg_size; - } + recv_elem->cur_offset = cur_offset; + MPIDI_OFI_get_huge_event(info->vni_dst, NULL, (MPIR_Request *) recv_elem); fn_exit: @@ -30,27 +51,27 @@ static int get_huge(MPIR_Request * rreq) goto fn_exit; } -static int get_huge_complete(MPIDI_OFI_huge_recv_t * recv_elem) +static int get_huge_complete(MPIR_Request * rreq) { int mpi_errno = MPI_SUCCESS; MPIR_FUNC_ENTER; - MPIDI_OFI_huge_remote_info_t *info = MPIDI_OFI_REQUEST(recv_elem->localreq, huge.remote_info); + MPIDI_OFI_huge_remote_info_t *info = MPIDI_OFI_REQUEST(rreq, huge.remote_info); /* note: it's receiver ack sender */ int vni_remote = info->vni_src; int vni_local = info->vni_dst; struct fi_cq_tagged_entry wc; - wc.len = recv_elem->cur_offset; + wc.len = info->msgsize; wc.data = info->origin_rank; wc.tag = info->tag; - MPIDI_OFI_recv_event(vni_local, &wc, recv_elem->localreq, MPIDI_OFI_EVENT_GET_HUGE); + MPIDI_OFI_recv_event(vni_local, &wc, rreq, MPIDI_OFI_EVENT_GET_HUGE); MPIDI_OFI_send_control_t ctrl; ctrl.type = MPIDI_OFI_CTRL_HUGEACK; ctrl.u.huge_ack.ackreq = info->ackreq; - mpi_errno = MPIDI_NM_am_send_hdr(info->origin_rank, comm, + mpi_errno = MPIDI_NM_am_send_hdr(info->origin_rank, rreq->comm, MPIDI_OFI_INTERNAL_HANDLER_CONTROL, &ctrl, sizeof(ctrl), vni_local, vni_remote); MPIR_ERR_CHECK(mpi_errno); @@ -72,7 +93,9 @@ int MPIDI_OFI_recv_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reque MPIR_FUNC_ENTER; bool ready_to_get = false; - if (MPIDI_OFI_COMM(rreq->comm).enable_striping) { + if (MPIDI_OFI_REQUEST(rreq, event_id) != MPIDI_OFI_EVENT_RECV_HUGE) { + /* huge send recved by a small buffer */ + } else if (MPIDI_OFI_COMM(rreq->comm).enable_striping) { MPIR_Assert(wc->len == MPIDI_OFI_STRIPE_CHUNK_SIZE); } else { MPIR_Assert(wc->len == MPIDI_OFI_global.max_msg_size); @@ -287,11 +310,6 @@ int MPIDI_OFI_get_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reques MPIR_FUNC_ENTER; void *recv_buf = MPIDI_OFI_REQUEST(recv_elem->localreq, util.iov.iov_base); - MPI_Aint data_sz = MPIDI_OFI_REQUEST(recv_elem->localreq, util.iov.iov_len); - if (info->msgsize > data_sz) { - recv_elem->localreq->status.MPI_ERROR = MPI_ERR_TRUNCATE; - info->msgsize = data_sz; - } if (MPIDI_OFI_COMM(comm).enable_striping) { /* Subtract one stripe_chunk_size because we send the first chunk via a regular message @@ -314,7 +332,7 @@ int MPIDI_OFI_get_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reques bytesLeft : MPIDI_OFI_global.max_msg_size; } if (bytesToGet == 0ULL && recv_elem->chunks_outstanding == 0) { - mpi_errno = get_huge_complete(recv_elem); + mpi_errno = get_huge_complete(recv_elem->localreq); MPIR_ERR_CHECK(mpi_errno); MPL_free(recv_elem); goto fn_exit; From 9e8c23d38b3fc02e4dd34e4b3295c09e8bc8e839 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Sun, 26 Sep 2021 22:10:19 -0500 Subject: [PATCH 18/21] ch4/ofi: add vni assertions in control handler Add safety assertions to ensure consistentcy. --- src/mpid/ch4/netmod/ofi/util.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/mpid/ch4/netmod/ofi/util.c b/src/mpid/ch4/netmod/ofi/util.c index a574ae3d328..870e1a2362e 100644 --- a/src/mpid/ch4/netmod/ofi/util.c +++ b/src/mpid/ch4/netmod/ofi/util.c @@ -161,12 +161,15 @@ int MPIDI_OFI_control_handler(void *am_hdr, void *data, MPI_Aint data_sz, } int local_vci = MPIDIG_AM_ATTR_DST_VCI(attr); + MPIR_AssertDeclValue(int remote_vci, MPIDIG_AM_ATTR_SRC_VCI(attr)); switch (ctrlsend->type) { case MPIDI_OFI_CTRL_HUGEACK: mpi_errno = MPIDI_OFI_dispatch_function(local_vci, NULL, ctrlsend->u.huge_ack.ackreq); break; case MPIDI_OFI_CTRL_HUGE: + MPIR_Assert(local_vci == ctrlsend->u.huge.info.vni_dst); + MPIR_Assert(remote_vci == ctrlsend->u.huge.info.vni_src); mpi_errno = MPIDI_OFI_recv_huge_control(ctrlsend->u.huge.info.comm_id, ctrlsend->u.huge.info.origin_rank, ctrlsend->u.huge.info.tag, From ddc0f9751007bd8cc4ab8d0586645aedc0d4bc2e Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Sun, 26 Sep 2021 22:46:44 -0500 Subject: [PATCH 19/21] ch4/ofi: set recv data_sz correctly in the huge path The huge recv modifies the data_sz. Thus we need set MPIDI_OFI_REQUEST(rreq, util.iov.iov_len) earlier to prevent setting the wrong size. --- src/mpid/ch4/netmod/ofi/ofi_recv.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_recv.h b/src/mpid/ch4/netmod/ofi/ofi_recv.h index ea58cf6624b..3a4d389af96 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_recv.h +++ b/src/mpid/ch4/netmod/ofi/ofi_recv.h @@ -230,6 +230,8 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_irecv(void *buf, } /* Read ordering unnecessary for context_id, so use relaxed load */ MPL_atomic_relaxed_store_int(&MPIDI_OFI_REQUEST(rreq, util_id), context_id); + MPIDI_OFI_REQUEST(rreq, util.iov.iov_base) = recv_buf; + MPIDI_OFI_REQUEST(rreq, util.iov.iov_len) = data_sz; if (unlikely(data_sz >= MPIDI_OFI_global.max_msg_size) && !MPIDI_OFI_COMM(comm).enable_striping) { MPIDI_OFI_REQUEST(rreq, event_id) = MPIDI_OFI_EVENT_RECV_HUGE; @@ -244,8 +246,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_irecv(void *buf, } else if (MPIDI_OFI_REQUEST(rreq, event_id) != MPIDI_OFI_EVENT_RECV_PACK) MPIDI_OFI_REQUEST(rreq, event_id) = MPIDI_OFI_EVENT_RECV; - MPIDI_OFI_REQUEST(rreq, util.iov.iov_base) = recv_buf; - MPIDI_OFI_REQUEST(rreq, util.iov.iov_len) = data_sz; if (!flags) { MPIDI_OFI_CALL_RETRY(fi_trecv(MPIDI_OFI_global.ctx[ctx_idx].rx, recv_buf, From ca8c3f3a688adce803178461306287231c34ecac Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Mon, 27 Sep 2021 00:11:12 -0500 Subject: [PATCH 20/21] ch4/ofi: fix usage of comm_id The comm_id in huge message path is, in fact, context_id of the communicator. We need use MPIR_Context_id_t as type. Direct applying mask to wc->tag won't get the corresponding context_id due to missing shift. Fix it by directly using rreq->comm->recvcontext_id. --- src/mpid/ch4/netmod/ofi/ofi_huge.c | 12 ++++++------ src/mpid/ch4/netmod/ofi/ofi_impl.h | 2 +- src/mpid/ch4/netmod/ofi/ofi_types.h | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_huge.c b/src/mpid/ch4/netmod/ofi/ofi_huge.c index 04ca13a2f1e..ef227f73681 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_huge.c +++ b/src/mpid/ch4/netmod/ofi/ofi_huge.c @@ -110,7 +110,7 @@ int MPIDI_OFI_recv_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reque } else { /* Check for remote control info */ MPIDI_OFI_huge_recv_list_t *list_ptr; - int comm_id = comm_ptr->context_id; + MPIR_Context_id_t comm_id = comm_ptr->recvcontext_id; int rank = MPIDI_OFI_cqe_get_source(wc, false); int tag = (MPIDI_OFI_TAG_MASK & wc->tag); @@ -132,7 +132,7 @@ int MPIDI_OFI_recv_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reque if (!list_ptr) MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem"); - list_ptr->comm_id = comm_ptr->context_id; + list_ptr->comm_id = comm_ptr->recvcontext_id; list_ptr->rank = MPIDI_OFI_cqe_get_source(wc, false); list_ptr->tag = (MPIDI_OFI_TAG_MASK & wc->tag); list_ptr->u.rreq = rreq; @@ -153,7 +153,7 @@ int MPIDI_OFI_recv_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reque } /* This function is called when we receive a huge control message */ -int MPIDI_OFI_recv_huge_control(int comm_id, int rank, int tag, +int MPIDI_OFI_recv_huge_control(MPIR_Context_id_t comm_id, int rank, int tag, MPIDI_OFI_huge_remote_info_t * info_ptr) { int mpi_errno = MPI_SUCCESS; @@ -224,10 +224,10 @@ int MPIDI_OFI_peek_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reque * with this and return the size in that. */ LL_FOREACH(MPIDI_huge_ctrl_head, list_ptr) { /* FIXME: fix the type of comm_id */ - uint64_t context_id = MPIDI_OFI_CONTEXT_MASK & wc->tag; + MPIR_Context_id_t comm_id = rreq->comm->recvcontext_id; int rank = MPIDI_OFI_cqe_get_source(wc, false); int tag = (int) (MPIDI_OFI_TAG_MASK & wc->tag); - if (list_ptr->comm_id == context_id && list_ptr->rank == rank && list_ptr->tag == tag) { + if (list_ptr->comm_id == comm_id && list_ptr->rank == rank && list_ptr->tag == tag) { count = list_ptr->u.info->msgsize; found_msg = true; break; @@ -263,7 +263,7 @@ int MPIDI_OFI_peek_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Reque (MPIDI_OFI_huge_recv_list_t *) MPL_calloc(sizeof(*huge_list_ptr), 1, MPL_MEM_COMM); MPIR_ERR_CHKANDJUMP(huge_list_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem"); - huge_list_ptr->comm_id = MPIDI_OFI_CONTEXT_MASK & wc->tag; + huge_list_ptr->comm_id = rreq->comm->recvcontext_id; huge_list_ptr->rank = MPIDI_OFI_cqe_get_source(wc, false); huge_list_ptr->tag = MPIDI_OFI_TAG_MASK & wc->tag; huge_list_ptr->u.rreq = rreq; diff --git a/src/mpid/ch4/netmod/ofi/ofi_impl.h b/src/mpid/ch4/netmod/ofi/ofi_impl.h index 1014ad0e3d5..24d47384a4b 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_impl.h +++ b/src/mpid/ch4/netmod/ofi/ofi_impl.h @@ -309,7 +309,7 @@ MPL_STATIC_INLINE_PREFIX void MPIDI_OFI_cntr_set(int ctx_idx, int val) #define MPIDI_OFI_INVALID_MR_KEY 0xFFFFFFFFFFFFFFFFULL int MPIDI_OFI_retry_progress(void); int MPIDI_OFI_recv_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq); -int MPIDI_OFI_recv_huge_control(int comm_id, int rank, int tag, +int MPIDI_OFI_recv_huge_control(MPIR_Context_id_t comm_id, int rank, int tag, MPIDI_OFI_huge_remote_info_t * info); int MPIDI_OFI_peek_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq); int MPIDI_OFI_get_huge_event(int vni, struct fi_cq_tagged_entry *wc, MPIR_Request * req); diff --git a/src/mpid/ch4/netmod/ofi/ofi_types.h b/src/mpid/ch4/netmod/ofi/ofi_types.h index b110b8a8139..3d50f4f44cd 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_types.h +++ b/src/mpid/ch4/netmod/ofi/ofi_types.h @@ -382,7 +382,7 @@ typedef struct { } MPIDI_OFI_global_t; typedef struct { - int comm_id; + MPIR_Context_id_t comm_id; int origin_rank; int tag; MPIR_Request *ackreq; @@ -511,7 +511,7 @@ typedef struct MPIDI_OFI_huge_recv { * data from the remote memory region and we need a way of matching up the * control messages with the "real" requests. */ typedef struct MPIDI_OFI_huge_recv_list { - int comm_id; + MPIR_Context_id_t comm_id; int rank; int tag; union { From 4e576000820321e94ec4320a61ea3e1f74025ed4 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 28 Sep 2021 17:46:22 -0500 Subject: [PATCH 21/21] test: threads/pt2pt/ssend.c --- test/mpi/threads/pt2pt/Makefile.am | 2 +- test/mpi/threads/pt2pt/ssend.c | 137 +++++++++++++++++++++++++++++ 2 files changed, 138 insertions(+), 1 deletion(-) create mode 100644 test/mpi/threads/pt2pt/ssend.c diff --git a/test/mpi/threads/pt2pt/Makefile.am b/test/mpi/threads/pt2pt/Makefile.am index 536a1b20150..aed8b7e4e2c 100644 --- a/test/mpi/threads/pt2pt/Makefile.am +++ b/test/mpi/threads/pt2pt/Makefile.am @@ -8,7 +8,7 @@ include $(top_srcdir)/Makefile_threads.mtest EXTRA_DIST = testlist noinst_PROGRAMS = threads threaded_sr alltoall sendselfth greq_wait greq_test \ - multisend multisend2 multisend3 multisend4 ibsend \ + multisend multisend2 multisend3 multisend4 ibsend ssend \ mt_sendrecv mt_bsendrecv mt_ssendrecv \ mt_isendirecv mt_ibsendirecv mt_issendirecv \ mt_sendrecv_huge mt_bsendrecv_huge mt_ssendrecv_huge \ diff --git a/test/mpi/threads/pt2pt/ssend.c b/test/mpi/threads/pt2pt/ssend.c new file mode 100644 index 00000000000..c4b165f5191 --- /dev/null +++ b/test/mpi/threads/pt2pt/ssend.c @@ -0,0 +1,137 @@ +/* + * Copyright (C) by Argonne National Laboratory + * See COPYRIGHT in top-level directory + */ + +#include +#include +#include "mpitest.h" +#include "mpithreadtest.h" + +#define MAX_COUNT 1024 * 1600 + +#define NUM_THREADS 4 +#define NUM_MSG_PER_THREAD 10 +#define NUM_CHECK 4 + +#define TOTAL NUM_THREADS * NUM_MSG_PER_THREAD + +int buf[TOTAL][MAX_COUNT]; +MPI_Request reqs[TOTAL]; +int counts[NUM_THREADS]; + +MPI_Comm comm = MPI_COMM_WORLD; +int tag = 1; + +static MTEST_THREAD_RETURN_TYPE do_ssend(void *arg) +{ + int id = (long) arg; + int base = id * NUM_MSG_PER_THREAD; + + for (int i = 0; i < NUM_MSG_PER_THREAD; i++) { + buf[base + i][0] = id; + int count = 1; + if (i % 2 == 0) { + count = MAX_COUNT; + } + MPI_Issend(buf[base + i], count, MPI_INT, 1, tag, comm, &reqs[base + i]); + } + return NULL; +} + +static MTEST_THREAD_RETURN_TYPE do_recv(void *arg) +{ + int id = (long) arg; + int base = id * NUM_MSG_PER_THREAD; + + for (int i = 0; i < NUM_CHECK; i++) { + MPI_Irecv(buf[base + i], MAX_COUNT, MPI_INT, 0, tag, comm, &reqs[base + i]); + } + MPI_Waitall(NUM_CHECK, reqs + base, MPI_STATUSES_IGNORE); + return NULL; +} + +int main(int argc, char *argv[]) +{ + int errs = 0; + + int provided; + MTest_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided); + if (provided != MPI_THREAD_MULTIPLE) { + printf("MPI_THREAD_MULTIPLE not supported by the MPI implementation\n"); + MPI_Abort(MPI_COMM_WORLD, -1); + } + + int rank, size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + if (size != 2) { + printf("This test require 2 processes\n"); + MPI_Abort(MPI_COMM_WORLD, -1); + } + + + if (rank == 0) { + for (int i = 0; i < NUM_THREADS; i++) { + /* Issend NUM_MSG_PER_THREAD * NUM_THREADS messages */ + MTest_Start_thread(do_ssend, (void *) (long) i); + } + MTest_Join_threads(); + + for (int i = 0; i < NUM_CHECK * NUM_THREADS; i++) { + int id, indx; + MPI_Waitany(TOTAL, reqs, &indx, MPI_STATUS_IGNORE); + id = indx / NUM_MSG_PER_THREAD; + printf(" - %d - %d send complete\n", id, indx); + counts[id]++; + } + + MPI_Send(counts, NUM_THREADS, MPI_INT, 1, tag + 1, comm); + + MPI_Barrier(comm); + + MPI_Waitall(TOTAL, reqs, MPI_STATUSES_IGNORE); + } else { +#if 0 + for (int i = 0; i < NUM_THREADS; i++) { + /* Receive NUM_CHECK * NUM_THREADS messages */ + MTest_Start_thread(do_recv, (void *) (long) i); + } + MTest_Join_threads(); + + for (int j = 0; j < NUM_THREADS; j++) { + for (int i = 0; i < NUM_CHECK; i++) { + int id = buf[j * NUM_MSG_PER_THREAD + i][0]; + counts[id]++; + } + } +#else + for (int i = 0; i < NUM_CHECK * NUM_THREADS; i++) { + MPI_Irecv(buf[i], MAX_COUNT, MPI_INT, 0, tag, comm, &reqs[i]); + } + MPI_Waitall(NUM_CHECK * NUM_THREADS, reqs, MPI_STATUSES_IGNORE); + for (int i = 0; i < NUM_CHECK * NUM_THREADS; i++) { + int id = buf[i][0]; + counts[id]++; + } +#endif + int recv_counts[TOTAL]; + MPI_Recv(recv_counts, NUM_THREADS, MPI_INT, 0, tag + 1, comm, MPI_STATUS_IGNORE); + for (int i = 0; i < NUM_THREADS; i++) { + if (counts[i] != recv_counts[i]) { + errs++; + } + printf("From thread %d, received %d messages, sender reported %d ssend completed\n", i, + counts[i], recv_counts[i]); + } + + MPI_Barrier(comm); + + for (int i = NUM_CHECK * NUM_THREADS; i < TOTAL; i++) { + MPI_Recv(&buf[i], MAX_COUNT, MPI_INT, 0, tag, comm, MPI_STATUS_IGNORE); + } + } + + MTest_Finalize(errs); + return 0; +}